Author: jerome Date: Fri Feb 17 15:22:55 2006 New Revision: 378653 URL: http://svn.apache.org/viewcvs?rev=378653&view=rev Log: Adapt parse-rtf to Nutch API changes (metadata, parse, protocol, ...)
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653&r1=378652&r2=378653&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Fri Feb 17 15:22:55 2006 @@ -13,38 +13,42 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.parse.rtf; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; -<<<<<<< .mine -<<<<<<< .mine -import org.apache.nutch.util.MetadataNames; - -======= -import org.apache.nutch.util.NutchConf; -======= -import org.apache.hadoop.conf.Configuration; ->>>>>>> .r374853 ->>>>>>> .r373941 +// JDK imports import java.io.ByteArrayInputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.util.Properties; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; + +// RTF Parser imports +import com.etranslate.tm.processing.rtf.ParseException; import com.etranslate.tm.processing.rtf.RTFParser; + /** * A parser for RTF documents * * @author Andy Hedges */ -public class RTFParseFactory implements Parser, MetadataNames { +public class RTFParseFactory implements Parser { private Configuration conf; - public Parse getParse(Content content) throws ParseException { + public Parse getParse(Content content) { byte[] raw = content.getContent(); Reader reader = new InputStreamReader(new ByteArrayInputStream(raw)); RTFParserDelegateImpl delegate = new RTFParserDelegateImpl(); @@ -55,28 +59,31 @@ try { rtfParser.parse(); - } catch (com.etranslate.tm.processing.rtf.ParseException e) { - throw new ParseException("Exception parsing RTF document", e); + } catch (ParseException e) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + e.toString()).getEmptyParse(conf); } - Properties metadata = new Properties(); - metadata.putAll(content.getMetadata()); - 
metadata.putAll(delegate.getMetaData()); - String title = metadata.getProperty(TITLE); + Metadata metadata = new Metadata(); + metadata.setAll(delegate.getMetaData()); + String title = metadata.get(DublinCore.TITLE); if (title != null) { - //(CM): Why remove the title metadata property here? Even - //though it's stored in the ParseData, it still might be useful - //to have via this properties object? - //metadata.remove(title); + metadata.remove(DublinCore.TITLE); } else { title = ""; } String text = delegate.getText(); - return new ParseImpl(text, new ParseData(title, OutlinkExtractor - .getOutlinks(text, this.conf), metadata)); + return new ParseImpl(text, + new ParseData(ParseStatus.STATUS_SUCCESS, + title, + OutlinkExtractor + . getOutlinks(text, this.conf), + content.getMetadata(), + metadata)); } public void setConf(Configuration conf) { Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653&r1=378652&r2=378653&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java Fri Feb 17 15:22:55 2006 @@ -16,27 +16,48 @@ package org.apache.nutch.parse.rtf; +// RTF Parser imports import com.etranslate.tm.processing.rtf.RTFParserDelegate; +// JDK imports import java.util.Arrays; import java.util.List; import java.util.Properties; -import org.apache.nutch.util.MetadataNames; +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Office; + /** * A parser delegate for handling rtf events. 
* @author Andy Hedges */ -public class RTFParserDelegateImpl implements RTFParserDelegate, MetadataNames { +public class RTFParserDelegateImpl implements RTFParserDelegate { String tabs = ""; Properties metadata = new Properties(); - String[] META_NAMES_TEXT = {TITLE, SUBJECT, AUTHOR, "manager", - "company", "operator", "category", KEYWORDS, - COMMENTS, "doccomm", "hlinkbase"}; - String[] META_NAMES_DATE = {"creatim", "creatim", "printim", "buptim"}; + String[] META_NAMES_TEXT = { + DublinCore.TITLE, + DublinCore.SUBJECT, + Office.AUTHOR, + "manager", + "company", + "operator", + "category", + Office.KEYWORDS, + Office.COMMENTS, + "doccomm", + "hlinkbase" + }; + + String[] META_NAMES_DATE = { + "creatim", + "creatim", + "printim", + "buptim" + }; String metaName = ""; List metaNamesText = Arrays.asList(META_NAMES_TEXT); Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=378653&r1=378652&r2=378653&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Fri Feb 17 15:22:55 2006 @@ -16,33 +16,33 @@ package org.apache.nutch.parse.rtf; +// JUnit imports import junit.framework.TestCase; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParserFactory; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import 
org.apache.nutch.protocol.ProtocolFactory; -<<<<<<< .mine -<<<<<<< .mine -import org.apache.nutch.util.MetadataNames; -======= -import org.apache.nutch.util.NutchConf; -======= +import org.apache.nutch.util.NutchConfiguration; + +// Hadoop imports import org.apache.hadoop.conf.Configuration; ->>>>>>> .r374853 ->>>>>>> .r373941 +import org.apache.hadoop.io.UTF8; -import java.util.Properties; /** * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). * * @author Andy Hedges */ -public class TestRTFParser extends TestCase implements MetadataNames { +public class TestRTFParser extends TestCase { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml @@ -72,16 +72,16 @@ Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + rtfFile; protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getContent(urlString); - + content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) + .getContent(); parse = new ParseUtil(conf).parseByParserId("parse-rtf", content); String text = parse.getText(); assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); String title = parse.getData().getTitle(); - Properties meta = parse.getData().getMetadata(); + Metadata meta = parse.getData().getParseMeta(); assertEquals("test rft document", title); - assertEquals("tests", meta.getProperty(SUBJECT)); + assertEquals("tests", meta.get(DublinCore.SUBJECT)); ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! Download the new AJAX search engine that makes searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! 
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs