plugin...

mattmann Tue, 19 Jun 2007 07:01:37 -0700

Author: mattmann
Date: Tue Jun 19 07:01:02 2007
New Revision: 548730

URL: http://svn.apache.org/viewvc?view=rev&rev=548730
Log:
fix for NUTCH-444


Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java
    lucene/nutch/trunk/src/plugin/feed/
    lucene/nutch/trunk/src/plugin/feed/build.xml
    lucene/nutch/trunk/src/plugin/feed/lib/
    lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar   (with props)
    lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar   (with props)
    lucene/nutch/trunk/src/plugin/feed/plugin.xml
    lucene/nutch/trunk/src/plugin/feed/sample/
    lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss
    lucene/nutch/trunk/src/plugin/feed/src/
    lucene/nutch/trunk/src/plugin/feed/src/java/
    lucene/nutch/trunk/src/plugin/feed/src/java/org/
    lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/
    lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/
    
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
    lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/
    
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
    lucene/nutch/trunk/src/plugin/feed/src/test/
    lucene/nutch/trunk/src/plugin/feed/src/test/org/
    lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/
    
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/parse-plugins.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Jun 19 07:01:02 2007
@@ -47,6 +47,12 @@
 
 15. NUTCH-502 - Bug in SegmentReader causes infinite loop. 
     (Ilya Vishnevsky via dogacan)
+    
+16. NUTCH-444 Possibly use a different library to parse RSS feed for improved 
+    performance and compatibility. This patch introduced a new plugin, feed,
+    that includes an index filter and a parse plugin for feeds that uses ROME.
+    There was discussion to remove parse-rss, in light of the feed plugin, 
+    however, this patch does not explicitly remove parse-rss. (dogacan, mattmann)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Jun 19 07:01:02 2007
@@ -35,7 +35,8 @@
        </mimeType>
 
        <mimeType name="application/rss+xml">
-               <plugin id="parse-rss" />
+           <plugin id="parse-rss" />
+           <plugin id="feed" />
        </mimeType>
 
        <mimeType name="application/vnd.ms-excel">
@@ -171,6 +172,7 @@
        <mimeType name="text/xml">
                <plugin id="parse-html" />
                <plugin id="parse-rss" />
+        <plugin id="feed" />
        </mimeType>
 
        <!-- Types for parse-ext plugin: required for unit tests to pass. -->
@@ -204,6 +206,8 @@
                        extension-id="org.apache.nutch.parse.pdf.PdfParser" />
                <alias name="parse-rss"
                        extension-id="org.apache.nutch.parse.rss.RSSParser" />
+        <alias name="feed"
+            extension-id="org.apache.nutch.parse.feed.FeedParser" />
                <alias name="parse-rtf"
                        
extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" />
                <alias name="parse-swf"

Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java Tue Jun 19 
07:01:02 2007
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+/**
+ * A collection of Feed property names extracted by the ROME library.
+ * 
+ * 
+ * @author mattmann
+ * @author dogacan
+ */
+public interface Feed {
+
+  public static final String FEED_AUTHOR = "author";
+
+  public static final String FEED_TAGS = "tag";
+
+  public static final String FEED_PUBLISHED = "published";
+
+  public static final String FEED_UPDATED = "updated";
+
+  public static final String FEED = "feed";
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Tue Jun 
19 07:01:02 2007
@@ -36,7 +36,7 @@
  *
  */
 public class Metadata implements Writable, CreativeCommons,
-DublinCore, HttpHeaders, Nutch, Office {
+DublinCore, HttpHeaders, Nutch, Office, Feed {
 
   /**
    * A map of all metadata attributes.

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Jun 19 07:01:02 2007
@@ -28,6 +28,7 @@
   <target name="deploy">
      <ant dir="clustering-carrot2" target="deploy"/>
      <ant dir="creativecommons" target="deploy"/>
+     <ant dir="feed" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
      <ant dir="languageidentifier" target="deploy"/>
@@ -94,6 +95,7 @@
      <ant dir="parse-oo" target="test"/>
      <ant dir="parse-pdf" target="test"/>
      <ant dir="parse-rss" target="test"/>
+     <ant dir="feed" target="test"/>
  <!-- <ant dir="parse-rtf" target="test"/> -->
      <ant dir="parse-swf" target="test"/>
      <ant dir="parse-zip" target="test"/>
@@ -115,6 +117,7 @@
     <ant dir="analysis-fr" target="clean"/>
     <ant dir="clustering-carrot2" target="clean"/>
     <ant dir="creativecommons" target="clean"/>
+    <ant dir="feed" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="languageidentifier" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/feed/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/build.xml?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/feed/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/feed/build.xml Tue Jun 19 07:01:02 2007
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<!--
+       Licensed to the Apache Software Foundation (ASF) under one or more
+       contributor license agreements.  See the NOTICE file distributed with
+       this work for additional information regarding copyright ownership.
+       The ASF licenses this file to You under the Apache License, Version 2.0
+       (the "License"); you may not use this file except in compliance with
+       the License.  You may obtain a copy of the License at
+       
+       http://www.apache.org/licenses/LICENSE-2.0
+       
+       Unless required by applicable law or agreed to in writing, software
+       distributed under the License is distributed on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+       See the License for the specific language governing permissions and
+       limitations under the License.
+-->
+
+<project name="feed" default="jar-core">
+
+    <import file="../build-plugin.xml" />
+    
+    <!-- Deploy Unit test dependencies -->
+    <target name="deps-test">
+      <ant target="deploy" inheritall="false"
+           dir="../nutch-extensionpoints" />
+      <ant target="deploy" inheritall="false" dir="../protocol-file" />
+    </target>
+    
+    <!-- for junit test -->
+    <mkdir dir="${build.test}/data" />
+    <copy file="sample/rsstest.rss" todir="${build.test}/data" />
+</project>

Added: lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar?view=auto&rev=548730
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar?view=auto&rev=548730
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/feed/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/plugin.xml?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/feed/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/feed/plugin.xml Tue Jun 19 07:01:02 2007
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<!--
+       Licensed to the Apache Software Foundation (ASF) under one or more
+       contributor license agreements.  See the NOTICE file distributed with
+       this work for additional information regarding copyright ownership.
+       The ASF licenses this file to You under the Apache License, Version 2.0
+       (the "License"); you may not use this file except in compliance with
+       the License.  You may obtain a copy of the License at
+       
+       http://www.apache.org/licenses/LICENSE-2.0
+       
+       Unless required by applicable law or agreed to in writing, software
+       distributed under the License is distributed on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+       See the License for the specific language governing permissions and
+       limitations under the License.
+-->
+<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0"
+        provider-name="nutch.org">
+    <runtime>
+      <library name="feed.jar">
+        <export name="*" />
+      </library>
+      <library name="rome-0.9.jar" />
+      <library name="jdom.jar" />
+    </runtime>
+    
+    <requires>
+      <import plugin="nutch-extensionpoints" />
+    </requires>
+    
+    <extension id="org.apache.nutch.parse.feed" name="Feed Parser"
+      point="org.apache.nutch.parse.Parser">
+      
+      <implementation id="org.apache.nutch.parse.feed.FeedParser"
+         class="org.apache.nutch.parse.feed.FeedParser">
+         <parameter name="contentType" value="application/rss+xml" />
+         <parameter name="contentType" value="application/atom+xml" />
+         <parameter name="contentType" value="text/xml" />
+         <parameter name="pathSuffix" value="rss" />
+     </implementation>
+    </extension>
+    <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer"
+       point="org.apache.nutch.indexer.IndexingFilter">
+     <implementation id="FeedIndexingFilter"
+       class="org.apache.nutch.indexer.feed.FeedIndexingFilter" />
+    </extension>
+</plugin>

Added: lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss (added)
+++ lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss Tue Jun 19 07:01:02 
2007
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+       Licensed to the Apache Software Foundation (ASF) under one or more
+       contributor license agreements.  See the NOTICE file distributed with
+       this work for additional information regarding copyright ownership.
+       The ASF licenses this file to You under the Apache License, Version 2.0
+       (the "License"); you may not use this file except in compliance with
+       the License.  You may obtain a copy of the License at
+       
+       http://www.apache.org/licenses/LICENSE-2.0
+       
+       Unless required by applicable law or agreed to in writing, software
+       distributed under the License is distributed on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+       See the License for the specific language governing permissions and
+       limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>

Added: 
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?view=auto&rev=548730
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
 Tue Jun 19 07:01:02 2007
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.feed;
+
+//JDK imports
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.TimeZone;
+
+//APACHE imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+
+/**
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ * 
+ * An {@link IndexingFilter} implementation to pull out the
+ * relevant extracted {@link Metadata} fields from the RSS feeds
+ * and into the index.
+ *
+ */
+public class FeedIndexingFilter implements IndexingFilter {
+  
+  public static final String dateFormatStr = "yyyyMMddHHmm";
+  
+  private Configuration conf;
+  
+  private final static String PUBLISHED_DATE = "publishedDate";
+  
+  private final static String UPDATED_DATE = "updatedDate";
+  
+  /**
+   * Extracts out the relevant fields:
+   * 
+   * <ul>
+   *  <li>FEED_AUTHOR</li>
+   *  <li>FEED_TAGS</li>
+   *  <li>FEED_PUBLISHED</li>
+   *  <li>FEED_UPDATED</li>
+   *  <li>FEED</li>
+   * </ul>
+   * 
+   * And sends them to the {@link Indexer} for indexing within the Nutch
+   * index.
+   *  
+   */
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum,
+                         Inlinks inlinks) throws IndexingException {
+    ParseData parseData = parse.getData();
+    Metadata parseMeta = parseData.getParseMeta();
+    
+    String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
+    String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
+    String published = parseMeta.get(Feed.FEED_PUBLISHED);
+    String updated = parseMeta.get(Feed.FEED_UPDATED);
+    String feed = parseMeta.get(Feed.FEED);
+    
+    if (authors != null) {
+      for (String author : authors) {
+        doc.add(new Field(Feed.FEED_AUTHOR, author, 
+            Field.Store.YES, Field.Index.TOKENIZED));
+      }
+    }
+    
+    if (tags != null) {
+      for (String tag : tags) {
+        doc.add(new Field(Feed.FEED_TAGS, tag, 
+            Field.Store.YES, Field.Index.TOKENIZED));
+      }
+    }
+    
+    if (feed != null)
+      doc.add(new Field(Feed.FEED, feed, Field.Store.YES, 
Field.Index.TOKENIZED));
+    
+    SimpleDateFormat sdf = new SimpleDateFormat(dateFormatStr);
+    sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
+    if (published != null) {
+      Date date = new Date(Long.parseLong(published));
+      String dateString = sdf.format(date);
+      doc.add(new Field(PUBLISHED_DATE, dateString, 
+                        Field.Store.YES, Field.Index.NO_NORMS));
+    }
+    
+    if (updated != null) {
+      Date date = new Date(Long.parseLong(updated));
+      String dateString = sdf.format(date);
+      doc.add(new Field(UPDATED_DATE, dateString, 
+                        Field.Store.YES, Field.Index.NO_NORMS));
+    }
+        
+    return doc;
+  }
+
+  /**
+   * @return the {@link Configuration} object used to configure
+   * this {@link IndexingFilter}.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Sets the {@link Configuration} object used to configure this
+   * {@link IndexingFilter}.
+   * 
+   * @param conf The {@link Configuration} object used to configure
+   * this {@link IndexingFilter}.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+}

Added: 
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java?view=auto&rev=548730
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
 Tue Jun 19 07:01:02 2007
@@ -0,0 +1,364 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+
+// APACHE imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParserNotFound;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.xml.sax.InputSource;
+
+// ROME imports
+import com.sun.syndication.feed.synd.SyndCategory;
+import com.sun.syndication.feed.synd.SyndContent;
+import com.sun.syndication.feed.synd.SyndEntry;
+import com.sun.syndication.feed.synd.SyndFeed;
+import com.sun.syndication.feed.synd.SyndPerson;
+import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ * 
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ * 
+ * <p>
+ * A new RSS/ATOM {@link Parser} that rapidly parses all referenced links
+ * and content present in the feed.
+ * </p>
+ * 
+ */
+public class FeedParser implements Parser {
+
+  public static final String CHARSET_UTF8 = "charset=UTF-8";
+
+  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
+      + CHARSET_UTF8;
+
+  public static final Log LOG = LogFactory
+      .getLog("org.apache.nutch.parse.feed");
+
+  private Configuration conf;
+
+  private ParserFactory parserFactory;
+
+  private URLNormalizers normalizers;
+
+  private URLFilters filters;
+
+  /**
+   * Parses the given feed and extracts out and parses all linked items within
+   * the feed, using the underlying ROME feed parsing library.
+   * 
+   * @param content
+   *          A {@link Content} object representing the feed that is being
+   *          parsed by this {@link Parser}.
+   * 
+   * @return A {@link ParseResult} containing all {@link Parse}d feeds that
+   *         were present in the feed file that this {@link Parser} dealt with.
+   * 
+   */
+  public ParseResult getParse(Content content) {
+    SyndFeed feed = null;
+    ParseResult parseResult = new ParseResult(content.getUrl());
+    try {
+      InputSource input = new InputSource(new ByteArrayInputStream(content
+          .getContent()));
+      SyndFeedInput feedInput = new SyndFeedInput();
+      feed = feedInput.build(input);
+    } catch (Exception e) {
+      // return empty parse
+      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
+          + StringUtils.stringifyException(e));
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    List entries = feed.getEntries();
+    String feedLink = feed.getLink();
+    try {
+      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
+      if (feedLink != null)
+        feedLink = filters.filter(feedLink);
+    } catch (Exception e) {
+      feedLink = null;
+    }
+
+    for (Iterator i = entries.iterator(); i.hasNext();) {
+      SyndEntry entry = (SyndEntry) i.next();
+      addToMap(parseResult, feed, feedLink, entry, content);
+    }
+
+    String feedDesc = stripTags(feed.getDescriptionEx());
+    String feedTitle = stripTags(feed.getTitleEx());
+
+    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
+        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
+        content.getMetadata()));
+
+    return parseResult;
+  }
+
+  /**
+   * 
+   * Sets the {@link Configuration} object for this {@link Parser}. This
+   * {@link Parser} expects the following configuration properties to be set:
+   * 
+   * <ul>
+   * <li>URLNormalizers - properties in the configuration object to set up the
+   * default url normalizers.</li>
+   * <li>URLFilters - properties in the configuration object to set up the
+   * default url filters.</li>
+   * </ul>
+   * 
+   * @param conf
+   *          The Hadoop {@link Configuration} object to use to configure this
+   *          {@link Parser}.
+   * 
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.parserFactory = new ParserFactory(conf);
+    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
+    this.filters = new URLFilters(conf);
+  }
+
+  /**
+   * 
+   * @return The {@link Configuration} object used to configure this
+   *         {@link Parser}.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Runs a command line version of this {@link Parser}.
+   * 
+   * @param args
+   *          A single argument (expected at arg[0]) representing a path on the
+   *          local filesystem that points to a feed file.
+   * 
+   * @throws Exception
+   *           If any error occurs.
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: FeedParser <feed>");
+      System.exit(1);
+    }
+    String name = args[0];
+    String url = "file:" + name;
+    Configuration conf = NutchConfiguration.create();
+    FeedParser parser = new FeedParser();
+    parser.setConf(conf);
+    File file = new File(name);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    in.readFully(bytes);
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/rss+xml", new Metadata(), conf));
+    for (Entry<Text, Parse> entry : parseResult) {
+      System.out.println("key: " + entry.getKey());
+      Parse parse = entry.getValue();
+      System.out.println("data: " + parse.getData());
+      System.out.println("text: " + parse.getText() + "\n");
+    }
+  }
+
+  private void addToMap(ParseResult parseResult, SyndFeed feed,
+      String feedLink, SyndEntry entry, Content content) {
+    String link = entry.getLink(), text = null, title = null;
+    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
+    Parse parse = null;
+    SyndContent description = entry.getDescription();
+
+    try {
+      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
+      if (link != null)
+        link = filters.filter(link);
+    } catch (Exception e) {
+      return;
+    }
+
+    if (link == null)
+      return;
+
+    title = stripTags(entry.getTitleEx());
+
+    if (feedLink != null)
+      parseMeta.set("feed", feedLink);
+
+    addFields(parseMeta, contentMeta, feed, entry);
+
+    // some item descriptions contain markup text in them,
+    // so we temporarily set their content-type to parse them
+    // with another plugin
+    String contentType = contentMeta.get(Response.CONTENT_TYPE);
+
+    if (description != null)
+      text = description.getValue();
+
+    if (text == null) {
+      List contents = entry.getContents();
+      StringBuilder buf = new StringBuilder();
+      for (Iterator i = contents.iterator(); i.hasNext();) {
+        SyndContent syndContent = (SyndContent) i.next();
+        buf.append(syndContent.getValue());
+      }
+      text = buf.toString();
+    }
+
+    try {
+      Parser parser = parserFactory.getParsers(contentType, link)[0];
+      parse = parser.getParse(
+          new Content(link, link, text.getBytes(), contentType, contentMeta,
+              conf)).get(link);
+    } catch (ParserNotFound e) { /* ignore */
+    }
+
+    if (parse != null) {
+      ParseData data = parse.getData();
+      data.getContentMeta().remove(Response.CONTENT_TYPE);
+      mergeMetadata(data.getParseMeta(), parseMeta);
+      parseResult.put(link, new ParseText(parse.getText()), new ParseData(
+          ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
+              .getContentMeta(), data.getParseMeta()));
+    } else {
+      contentMeta.remove(Response.CONTENT_TYPE);
+      parseResult.put(link, new ParseText(text), new ParseData(
+          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
+          parseMeta));
+    }
+
+  }
+
+  private static String stripTags(SyndContent c) {
+    if (c == null)
+      return "";
+
+    String value = c.getValue();
+
+    String[] parts = value.split("<[^>]*>");
+    StringBuffer buf = new StringBuffer();
+
+    for (String part : parts)
+      buf.append(part);
+
+    return buf.toString().trim();
+  }
+
+  private void addFields(Metadata parseMeta, Metadata contentMeta,
+      SyndFeed feed, SyndEntry entry) {
+    List authors = entry.getAuthors(), categories = entry.getCategories();
+    Date published = entry.getPublishedDate(), updated = 
entry.getUpdatedDate();
+    String contentType = null;
+
+    if (authors != null) {
+      for (Object o : authors) {
+        SyndPerson author = (SyndPerson) o;
+        String authorName = author.getName();
+        if (checkString(authorName)) {
+          parseMeta.add(Feed.FEED_AUTHOR, authorName);
+        }
+      }
+    } else {
+      // getAuthors may return null if feed is non-atom
+      // if so, call getAuthor to get Dublin Core module creator.
+      String authorName = entry.getAuthor();
+      if (checkString(authorName)) {
+        parseMeta.set(Feed.FEED_AUTHOR, authorName);
+      }
+    }
+
+    for (Iterator i = categories.iterator(); i.hasNext();) {
+      parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i.next()).getName());
+    }
+
+    if (published != null) {
+      parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
+    }
+    if (updated != null) {
+      parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
+    }
+
+    SyndContent description = entry.getDescription();
+    if (description != null) {
+      contentType = description.getType();
+    } else {
+      // TODO: What to do if contents.size() > 1?
+      List contents = entry.getContents();
+      if (contents.size() > 0) {
+        contentType = ((SyndContent) contents.get(0)).getType();
+      }
+    }
+
+    if (checkString(contentType)) {
+      // ROME may return content-type as html
+      if (contentType.equals("html"))
+        contentType = "text/html";
+      else if (contentType.equals("xhtml"))
+        contentType = "text/xhtml";
+      contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + 
CHARSET_UTF8);
+    } else {
+      contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
+    }
+
+  }
+
+  private void mergeMetadata(Metadata first, Metadata second) {
+    for (String name : second.names()) {
+      String[] values = second.getValues(name);
+      for (String value : values) {
+        first.add(name, value);
+      }
+    }
+  }
+
+  private boolean checkString(String s) {
+    return s != null && !s.equals("");
+  }
+
+}

Added: 
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?view=auto&rev=548730
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
 Tue Jun 19 07:01:02 2007
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.Map;
+
+// APACHE imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Junit imports
+import junit.framework.TestCase;
+
+/**
+ * Test suite for the {@link FeedParser}.
+ * <p>
+ * Sample feeds are read from the directory named by the "test.data" system
+ * property, falling back to the current directory when it is not set.
+ * 
+ * @author mattmann
+ */
+public class TestFeedParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/feed/build.xml during plugin compilation.
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Log LOG = LogFactory.getLog(TestFeedParser.class
+      .getName());
+
+  /**
+   * Default Constructor.
+   * 
+   * @param name
+   *          The name of this {@link TestCase}.
+   */
+  public TestFeedParser(String name) {
+    super(name);
+  }
+
+  /**
+   * Calls the {@link FeedParser} on a sample RSS file and checks that there are
+   * 3 {@link ParseResult} entries: the feed's own URL plus the below 2 links:
+   * <ul>
+   * <li>http://www-scf.usc.edu/~mattmann/</li>
+   * <li>http://www.nutch.org/</li>
+   * </ul>
+   * Every entry must also carry a non-null parse with non-null parse data.
+   * 
+   * @throws ProtocolNotFound
+   *           If the {@link Protocol} layer cannot be loaded (required to fetch
+   *           the {@link Content} for the RSS file).
+   * @throws ParseException
+   *           If the parser layer cannot be loaded.
+   */
+  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    // Both helpers depend only on conf, so build them once for all samples.
+    ProtocolFactory protocolFactory = new ProtocolFactory(conf);
+    ParseUtil parseUtil = new ParseUtil(conf);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Protocol protocol = protocolFactory.getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      ParseResult parseResult = parseUtil.parseByExtensionId("feed", content);
+
+      assertEquals(3, parseResult.size());
+
+      boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
+
+      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
+          .hasNext();) {
+        Map.Entry<Text, Parse> entry = j.next();
+        String key = entry.getKey().toString();
+        if (key.equals("http://www-scf.usc.edu/~mattmann/")) {
+          hasLink1 = true;
+        } else if (key.equals("http://www.nutch.org/")) {
+          hasLink2 = true;
+        } else if (key.equals(urlString)) {
+          // The feed file itself is expected among the parse results.
+          hasLink3 = true;
+        }
+
+        assertNotNull(entry.getValue());
+        assertNotNull(entry.getValue().getData());
+      }
+
+      if (!hasLink1 || !hasLink2 || !hasLink3) {
+        fail("Outlinks read from sample rss file are not correct!");
+      }
+    }
+
+  }
+
+}



-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] svn commit: r548730 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/metadata/ src/plugin/ src/plugin/feed/ src/plugin/feed/lib/ src/plugin/feed/sample/ src/plugin/feed/src/ src/plugin/feed/src/java/ src/plugin/feed/src/java/org/ src/plugin...

Reply via email to