Author: jerome
Date: Fri Feb 17 15:22:55 2006
New Revision: 378653

URL: http://svn.apache.org/viewcvs?rev=378653&view=rev
Log:
Adapt parse-rtf to Nutch API changes (metadata, parse, protocol, ...)

Modified:
    
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
    
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
    
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 Fri Feb 17 15:22:55 2006
@@ -13,38 +13,42 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.rtf;
 
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-<<<<<<< .mine
-<<<<<<< .mine
-import org.apache.nutch.util.MetadataNames;
-
-=======
-import org.apache.nutch.util.NutchConf;
-=======
-import org.apache.hadoop.conf.Configuration;
->>>>>>> .r374853
->>>>>>> .r373941
+// JDK imports
 import java.io.ByteArrayInputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Properties;
 
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+// RTF Parser imports
+import com.etranslate.tm.processing.rtf.ParseException;
 import com.etranslate.tm.processing.rtf.RTFParser;
 
+
 /**
  * A parser for RTF documents
  * 
  * @author Andy Hedges
  */
-public class RTFParseFactory implements Parser, MetadataNames {
+public class RTFParseFactory implements Parser {
 
   private Configuration conf;
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
     byte[] raw = content.getContent();
     Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
     RTFParserDelegateImpl delegate = new RTFParserDelegateImpl();
@@ -55,28 +59,31 @@
 
     try {
       rtfParser.parse();
-    } catch (com.etranslate.tm.processing.rtf.ParseException e) {
-      throw new ParseException("Exception parsing RTF document", e);
+    } catch (ParseException e) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_EXCEPTION,
+                               e.toString()).getEmptyParse(conf);
     }
 
-    Properties metadata = new Properties();
-    metadata.putAll(content.getMetadata());
-    metadata.putAll(delegate.getMetaData());
-    String title = metadata.getProperty(TITLE);
+    Metadata metadata = new Metadata();
+    metadata.setAll(delegate.getMetaData());
+    String title = metadata.get(DublinCore.TITLE);
 
     if (title != null) {
-        //(CM): Why remove the title metadata property here? Even 
-        //though it's stored in the ParseData, it still might be useful
-        //to have via this properties object?
-        //metadata.remove(title);
+      metadata.remove(DublinCore.TITLE);
     } else {
       title = "";
     }
 
     String text = delegate.getText();
 
-    return new ParseImpl(text, new ParseData(title, OutlinkExtractor
-        .getOutlinks(text, this.conf), metadata));
+    return new ParseImpl(text,
+                         new ParseData(ParseStatus.STATUS_SUCCESS,
+                                       title,
+                                       OutlinkExtractor
+        .                              getOutlinks(text, this.conf),
+                                       content.getMetadata(),
+                                       metadata));
   }
 
   public void setConf(Configuration conf) {

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
 Fri Feb 17 15:22:55 2006
@@ -16,27 +16,48 @@
 
 package org.apache.nutch.parse.rtf;
 
+// RTF Parser imports
 import com.etranslate.tm.processing.rtf.RTFParserDelegate;
 
+// JDK imports
 import java.util.Arrays;
 import java.util.List;
 import java.util.Properties;
 
-import org.apache.nutch.util.MetadataNames;
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Office;
+
 
 /**
  * A parser delegate for handling rtf events.
  * @author Andy Hedges
  */
-public class RTFParserDelegateImpl implements RTFParserDelegate, MetadataNames 
{
+public class RTFParserDelegateImpl implements RTFParserDelegate {
 
   String tabs = "";
   Properties metadata = new Properties();
 
-  String[] META_NAMES_TEXT = {TITLE, SUBJECT, AUTHOR, "manager",
-                              "company", "operator", "category", KEYWORDS,
-                              COMMENTS, "doccomm", "hlinkbase"};
-  String[] META_NAMES_DATE = {"creatim", "creatim", "printim", "buptim"};
+  String[] META_NAMES_TEXT = {
+    DublinCore.TITLE,
+    DublinCore.SUBJECT,
+    Office.AUTHOR,
+    "manager",
+    "company",
+    "operator",
+    "category",
+    Office.KEYWORDS,
+    Office.COMMENTS,
+    "doccomm",
+    "hlinkbase"
+  };
+  
+  String[] META_NAMES_DATE = {
+    "creatim",
+    "creatim",
+    "printim",
+    "buptim"
+  };
 
   String metaName = "";
   List metaNamesText = Arrays.asList(META_NAMES_TEXT);

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
 Fri Feb 17 15:22:55 2006
@@ -16,33 +16,33 @@
 
 package org.apache.nutch.parse.rtf;
 
+// JUnit imports
 import junit.framework.TestCase;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
-<<<<<<< .mine
-<<<<<<< .mine
-import org.apache.nutch.util.MetadataNames;
-=======
-import org.apache.nutch.util.NutchConf;
-=======
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
 import org.apache.hadoop.conf.Configuration;
->>>>>>> .r374853
->>>>>>> .r373941
+import org.apache.hadoop.io.UTF8;
 
-import java.util.Properties;
 
 /**
  * Unit tests for TestRTFParser.  (Adapted from John Xing msword unit tests).
  *
  * @author Andy Hedges
  */
-public class TestRTFParser extends TestCase implements MetadataNames {
+public class TestRTFParser extends TestCase {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
@@ -72,16 +72,16 @@
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getContent(urlString);
-
+    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+                      .getContent();
     parse = new ParseUtil(conf).parseByParserId("parse-rtf", content);
     String text = parse.getText();
     assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
 
     String title = parse.getData().getTitle();
-    Properties meta = parse.getData().getMetadata();
+    Metadata meta = parse.getData().getParseMeta();
     assertEquals("test rft document", title);
-    assertEquals("tests", meta.getProperty(SUBJECT));
+    assertEquals("tests", meta.get(DublinCore.SUBJECT));
 
 
 




-------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc. Do you grep through log files
for problems?  Stop!  Download the new AJAX search engine that makes
searching your log files as easy as surfing the web.  DOWNLOAD SPLUNK!
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to