Author: mattmann
Date: Tue Jun 19 07:01:02 2007
New Revision: 548730
URL: http://svn.apache.org/viewvc?view=rev&rev=548730
Log:
fix for NUTCH-444
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java
lucene/nutch/trunk/src/plugin/feed/
lucene/nutch/trunk/src/plugin/feed/build.xml
lucene/nutch/trunk/src/plugin/feed/lib/
lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar (with props)
lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar (with props)
lucene/nutch/trunk/src/plugin/feed/plugin.xml
lucene/nutch/trunk/src/plugin/feed/sample/
lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss
lucene/nutch/trunk/src/plugin/feed/src/
lucene/nutch/trunk/src/plugin/feed/src/java/
lucene/nutch/trunk/src/plugin/feed/src/java/org/
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
lucene/nutch/trunk/src/plugin/feed/src/test/
lucene/nutch/trunk/src/plugin/feed/src/test/org/
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/parse-plugins.xml
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
lucene/nutch/trunk/src/plugin/build.xml
Modified: lucene/nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Jun 19 07:01:02 2007
@@ -47,6 +47,12 @@
15. NUTCH-502 - Bug in SegmentReader causes infinite loop.
(Ilya Vishnevsky via dogacan)
+
+16. NUTCH-444 Possibly use a different library to parse RSS feed for improved
+ performance and compatibility. This patch introduced a new plugin, feed,
+ that includes an index filter and a parse plugin for feeds that uses ROME.
+ There was discussion to remove parse-rss, in light of the feed plugin,
+ however, this patch does not explicitly remove parse-rss. (dogacan, mattmann)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Jun 19 07:01:02 2007
@@ -35,7 +35,8 @@
</mimeType>
<mimeType name="application/rss+xml">
- <plugin id="parse-rss" />
+ <plugin id="parse-rss" />
+ <plugin id="feed" />
</mimeType>
<mimeType name="application/vnd.ms-excel">
@@ -171,6 +172,7 @@
<mimeType name="text/xml">
<plugin id="parse-html" />
<plugin id="parse-rss" />
+ <plugin id="feed" />
</mimeType>
<!-- Types for parse-ext plugin: required for unit tests to pass. -->
@@ -204,6 +206,8 @@
extension-id="org.apache.nutch.parse.pdf.PdfParser" />
<alias name="parse-rss"
extension-id="org.apache.nutch.parse.rss.RSSParser" />
+ <alias name="feed"
+ extension-id="org.apache.nutch.parse.feed.FeedParser" />
<alias name="parse-rtf"
extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" />
<alias name="parse-swf"
Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java Tue Jun 19
07:01:02 2007
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+/**
+ * Names of the metadata properties that the feed plugin extracts from
+ * syndication feeds with the ROME library.
+ *
+ * @author mattmann
+ * @author dogacan
+ */
+public interface Feed {
+
+  /** Key holding the author name(s) of a feed entry. */
+  String FEED_AUTHOR = "author";
+
+  /** Key holding the category (tag) name(s) of a feed entry. */
+  String FEED_TAGS = "tag";
+
+  /** Key holding the publication date of a feed entry. */
+  String FEED_PUBLISHED = "published";
+
+  /** Key holding the last-updated date of a feed entry. */
+  String FEED_UPDATED = "updated";
+
+  /** Key holding the link of the feed that an entry was found in. */
+  String FEED = "feed";
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Tue Jun
19 07:01:02 2007
@@ -36,7 +36,7 @@
*
*/
public class Metadata implements Writable, CreativeCommons,
-DublinCore, HttpHeaders, Nutch, Office {
+DublinCore, HttpHeaders, Nutch, Office, Feed {
/**
* A map of all metadata attributes.
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=548730&r1=548729&r2=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Jun 19 07:01:02 2007
@@ -28,6 +28,7 @@
<target name="deploy">
<ant dir="clustering-carrot2" target="deploy"/>
<ant dir="creativecommons" target="deploy"/>
+ <ant dir="feed" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-more" target="deploy"/>
<ant dir="languageidentifier" target="deploy"/>
@@ -94,6 +95,7 @@
<ant dir="parse-oo" target="test"/>
<ant dir="parse-pdf" target="test"/>
<ant dir="parse-rss" target="test"/>
+ <ant dir="feed" target="test"/>
<!-- <ant dir="parse-rtf" target="test"/> -->
<ant dir="parse-swf" target="test"/>
<ant dir="parse-zip" target="test"/>
@@ -115,6 +117,7 @@
<ant dir="analysis-fr" target="clean"/>
<ant dir="clustering-carrot2" target="clean"/>
<ant dir="creativecommons" target="clean"/>
+ <ant dir="feed" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="languageidentifier" target="clean"/>
Added: lucene/nutch/trunk/src/plugin/feed/build.xml
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/build.xml?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/feed/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/feed/build.xml Tue Jun 19 07:01:02 2007
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project name="feed" default="jar-core">
+
+ <!-- Build definitions shared with the other plugins are imported from
+ ../build-plugin.xml. -->
+ <import file="../build-plugin.xml" />
+
+ <!-- Deploy the plugins the unit test depends on: the extension points and
+ protocol-file (the test fetches its sample feed through a file: URL). -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false"
+ dir="../nutch-extensionpoints" />
+ <ant target="deploy" inheritall="false" dir="../protocol-file" />
+ </target>
+
+ <!-- Stage the sample feed in the test data directory for the JUnit test.
+ NOTE(review): these two tasks are declared outside any target. -->
+ <mkdir dir="${build.test}/data" />
+ <copy file="sample/rsstest.rss" todir="${build.test}/data" />
+</project>
Added: lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar?view=auto&rev=548730
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar?view=auto&rev=548730
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/feed/plugin.xml
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/plugin.xml?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/feed/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/feed/plugin.xml Tue Jun 19 07:01:02 2007
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Descriptor of the "feed" plugin. NOTE(review): the name advertises
+ "Query", but only a Parser and an IndexingFilter extension are declared
+ below. -->
+<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0"
+ provider-name="nutch.org">
+ <!-- Plugin code plus the bundled ROME and JDOM libraries it parses with. -->
+ <runtime>
+ <library name="feed.jar">
+ <export name="*" />
+ </library>
+ <library name="rome-0.9.jar" />
+ <library name="jdom.jar" />
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints" />
+ </requires>
+
+ <!-- Parser extension: FeedParser, declared for the RSS, Atom and generic
+ XML content types and for the "rss" path suffix. -->
+ <extension id="org.apache.nutch.parse.feed" name="Feed Parser"
+ point="org.apache.nutch.parse.Parser">
+
+ <implementation id="org.apache.nutch.parse.feed.FeedParser"
+ class="org.apache.nutch.parse.feed.FeedParser">
+ <parameter name="contentType" value="application/rss+xml" />
+ <parameter name="contentType" value="application/atom+xml" />
+ <parameter name="contentType" value="text/xml" />
+ <parameter name="pathSuffix" value="rss" />
+ </implementation>
+ </extension>
+ <!-- Indexing extension: FeedIndexingFilter. -->
+ <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="FeedIndexingFilter"
+ class="org.apache.nutch.indexer.feed.FeedIndexingFilter" />
+ </extension>
+</plugin>
Added: lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss?view=auto&rev=548730
==============================================================================
--- lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss (added)
+++ lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss Tue Jun 19 07:01:02
2007
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<rss version="0.91">
+ <channel>
+ <title>TestChannel</title>
+ <link>http://test.channel.com/</link>
+ <description>Sample RSS File for Junit test</description>
+ <language>en-us</language>
+
+ <item>
+ <title>Home Page of Chris Mattmann</title>
+ <link>http://www-scf.usc.edu/~mattmann/</link>
+ <description>Chris Mattmann's home page</description>
+ </item>
+ <item>
+ <title>Awesome Open Source Search Engine</title>
+ <link>http://www.nutch.org/</link>
+ <description>Yup, that's what it is</description>
+ </item>
+ </channel>
+</rss>
Added:
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?view=auto&rev=548730
==============================================================================
---
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
(added)
+++
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
Tue Jun 19 07:01:02 2007
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.feed;
+
+//JDK imports
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.TimeZone;
+
+//APACHE imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+
+/**
+ * An {@link IndexingFilter} implementation to pull out the relevant extracted
+ * {@link Metadata} fields from the RSS feeds and into the index.
+ *
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ */
+public class FeedIndexingFilter implements IndexingFilter {
+
+  /** Pattern used to render the published/updated dates (formatted in GMT). */
+  public static final String dateFormatStr = "yyyyMMddHHmm";
+
+  private Configuration conf;
+
+  private final static String PUBLISHED_DATE = "publishedDate";
+
+  private final static String UPDATED_DATE = "updatedDate";
+
+  /**
+   * Extracts out the relevant fields:
+   *
+   * <ul>
+   * <li>FEED_AUTHOR</li>
+   * <li>FEED_TAGS</li>
+   * <li>FEED_PUBLISHED</li>
+   * <li>FEED_UPDATED</li>
+   * <li>FEED</li>
+   * </ul>
+   *
+   * And sends them to the {@link Indexer} for indexing within the Nutch
+   * index.
+   *
+   * @param doc the document being built; feed fields are added to it
+   * @param parse the parse whose parse metadata is read
+   * @param url the URL of the document (used in error messages only)
+   * @param datum not used by this filter
+   * @param inlinks not used by this filter
+   * @return the same {@code doc} instance, with the feed fields added
+   * @throws IndexingException if the published or updated metadata value is
+   *           not a parsable {@code long}
+   */
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum,
+      Inlinks inlinks) throws IndexingException {
+    ParseData parseData = parse.getData();
+    Metadata parseMeta = parseData.getParseMeta();
+
+    String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
+    String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
+    String published = parseMeta.get(Feed.FEED_PUBLISHED);
+    String updated = parseMeta.get(Feed.FEED_UPDATED);
+    String feed = parseMeta.get(Feed.FEED);
+
+    if (authors != null) {
+      for (String author : authors) {
+        doc.add(new Field(Feed.FEED_AUTHOR, author,
+            Field.Store.YES, Field.Index.TOKENIZED));
+      }
+    }
+
+    if (tags != null) {
+      for (String tag : tags) {
+        doc.add(new Field(Feed.FEED_TAGS, tag,
+            Field.Store.YES, Field.Index.TOKENIZED));
+      }
+    }
+
+    if (feed != null) {
+      doc.add(new Field(Feed.FEED, feed, Field.Store.YES,
+          Field.Index.TOKENIZED));
+    }
+
+    // A fresh formatter per call: SimpleDateFormat is not thread-safe.
+    SimpleDateFormat sdf = new SimpleDateFormat(dateFormatStr);
+    sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
+    addDateField(doc, PUBLISHED_DATE, published, sdf, url);
+    addDateField(doc, UPDATED_DATE, updated, sdf, url);
+
+    return doc;
+  }
+
+  /**
+   * Adds a stored, norm-less date field rendered from a millisecond timestamp.
+   * Does nothing when {@code millis} is null.
+   *
+   * @throws IndexingException if {@code millis} is not a parsable {@code long};
+   *           previously this escaped as an unchecked NumberFormatException
+   */
+  private static void addDateField(Document doc, String fieldName,
+      String millis, SimpleDateFormat sdf, Text url) throws IndexingException {
+    if (millis == null) {
+      return;
+    }
+
+    final long time;
+    try {
+      time = Long.parseLong(millis);
+    } catch (NumberFormatException e) {
+      throw new IndexingException("Invalid " + fieldName + " value '" + millis
+          + "' for url: " + url, e);
+    }
+
+    doc.add(new Field(fieldName, sdf.format(new Date(time)),
+        Field.Store.YES, Field.Index.NO_NORMS));
+  }
+
+  /**
+   * @return the {@link Configuration} object used to configure
+   *         this {@link IndexingFilter}.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Sets the {@link Configuration} object used to configure this
+   * {@link IndexingFilter}.
+   *
+   * @param conf The {@link Configuration} object used to configure
+   *          this {@link IndexingFilter}.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+}
Added:
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java?view=auto&rev=548730
==============================================================================
---
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
(added)
+++
lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
Tue Jun 19 07:01:02 2007
@@ -0,0 +1,364 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+
+// APACHE imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParserNotFound;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.xml.sax.InputSource;
+
+// ROME imports
+import com.sun.syndication.feed.synd.SyndCategory;
+import com.sun.syndication.feed.synd.SyndContent;
+import com.sun.syndication.feed.synd.SyndEntry;
+import com.sun.syndication.feed.synd.SyndFeed;
+import com.sun.syndication.feed.synd.SyndPerson;
+import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ *
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ *
+ * <p>
+ * A new RSS/ATOM [EMAIL PROTECTED] Parser} that rapidly parses all referenced
links
+ * and content present in the feed.
+ * </p>
+ *
+ */
+public class FeedParser implements Parser {
+
+ public static final String CHARSET_UTF8 = "charset=UTF-8";
+
+ public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
+ + CHARSET_UTF8;
+
+ public static final Log LOG = LogFactory
+ .getLog("org.apache.nutch.parse.feed");
+
+ private Configuration conf;
+
+ private ParserFactory parserFactory;
+
+ private URLNormalizers normalizers;
+
+ private URLFilters filters;
+
+ /**
+ * Parses the given feed and extracts out and parsers all linked items within
+ * the feed, using the underlying ROME feed parsing library.
+ *
+ * @param content
+ * A [EMAIL PROTECTED] Content} object representing the feed that
is being
+ * parsed by this [EMAIL PROTECTED] Parser}.
+ *
+ * @return A [EMAIL PROTECTED] ParseResult} containing all [EMAIL PROTECTED]
Parse}d feeds that
+ * were present in the feed file that this [EMAIL PROTECTED] Parser}
dealt with.
+ *
+ */
+ public ParseResult getParse(Content content) {
+ SyndFeed feed = null;
+ ParseResult parseResult = new ParseResult(content.getUrl());
+ try {
+ InputSource input = new InputSource(new ByteArrayInputStream(content
+ .getContent()));
+ SyndFeedInput feedInput = new SyndFeedInput();
+ feed = feedInput.build(input);
+ } catch (Exception e) {
+ // return empty parse
+ LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
+ + StringUtils.stringifyException(e));
+ return new ParseStatus(e)
+ .getEmptyParseResult(content.getUrl(), getConf());
+ }
+
+ List entries = feed.getEntries();
+ String feedLink = feed.getLink();
+ try {
+ feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
+ if (feedLink != null)
+ feedLink = filters.filter(feedLink);
+ } catch (Exception e) {
+ feedLink = null;
+ }
+
+ for (Iterator i = entries.iterator(); i.hasNext();) {
+ SyndEntry entry = (SyndEntry) i.next();
+ addToMap(parseResult, feed, feedLink, entry, content);
+ }
+
+ String feedDesc = stripTags(feed.getDescriptionEx());
+ String feedTitle = stripTags(feed.getTitleEx());
+
+ parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
+ new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
+ content.getMetadata()));
+
+ return parseResult;
+ }
+
+ /**
+ *
+ * Sets the [EMAIL PROTECTED] Configuration} object for this [EMAIL
PROTECTED] Parser}. This
+ * [EMAIL PROTECTED] Parser} expects the following configuration properties
to be set:
+ *
+ * <ul>
+ * <li>URLNormalizers - properties in the configuration object to set up the
+ * default url normalizers.</li>
+ * <li>URLFilters - properties in the configuration object to set up the
+ * default url filters.</li>
+ * </ul>
+ *
+ * @param conf
+ * The Hadoop [EMAIL PROTECTED] Configuration} object to use to
configure this
+ * [EMAIL PROTECTED] Parser}.
+ *
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ this.parserFactory = new ParserFactory(conf);
+ this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
+ this.filters = new URLFilters(conf);
+ }
+
+ /**
+ *
+ * @return The [EMAIL PROTECTED] Configuration} object used to configure this
+ * [EMAIL PROTECTED] Parser}.
+ */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * Runs a command line version of this [EMAIL PROTECTED] Parser}.
+ *
+ * @param args
+ * A single argument (expected at arg[0]) representing a path on the
+ * local filesystem that points to a feed file.
+ *
+ * @throws Exception
+ * If any error occurs.
+ */
+ public static void main(String[] args) throws Exception {
+ if (args.length != 1) {
+ System.err.println("Usage: FeedParser <feed>");
+ System.exit(1);
+ }
+ String name = args[0];
+ String url = "file:" + name;
+ Configuration conf = NutchConfiguration.create();
+ FeedParser parser = new FeedParser();
+ parser.setConf(conf);
+ File file = new File(name);
+ byte[] bytes = new byte[(int) file.length()];
+ DataInputStream in = new DataInputStream(new FileInputStream(file));
+ in.readFully(bytes);
+ ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+ "application/rss+xml", new Metadata(), conf));
+ for (Entry<Text, Parse> entry : parseResult) {
+ System.out.println("key: " + entry.getKey());
+ Parse parse = entry.getValue();
+ System.out.println("data: " + parse.getData());
+ System.out.println("text: " + parse.getText() + "\n");
+ }
+ }
+
+ private void addToMap(ParseResult parseResult, SyndFeed feed,
+ String feedLink, SyndEntry entry, Content content) {
+ String link = entry.getLink(), text = null, title = null;
+ Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
+ Parse parse = null;
+ SyndContent description = entry.getDescription();
+
+ try {
+ link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
+ if (link != null)
+ link = filters.filter(link);
+ } catch (Exception e) {
+ return;
+ }
+
+ if (link == null)
+ return;
+
+ title = stripTags(entry.getTitleEx());
+
+ if (feedLink != null)
+ parseMeta.set("feed", feedLink);
+
+ addFields(parseMeta, contentMeta, feed, entry);
+
+ // some item descriptions contain markup text in them,
+ // so we temporarily set their content-type to parse them
+ // with another plugin
+ String contentType = contentMeta.get(Response.CONTENT_TYPE);
+
+ if (description != null)
+ text = description.getValue();
+
+ if (text == null) {
+ List contents = entry.getContents();
+ StringBuilder buf = new StringBuilder();
+ for (Iterator i = contents.iterator(); i.hasNext();) {
+ SyndContent syndContent = (SyndContent) i.next();
+ buf.append(syndContent.getValue());
+ }
+ text = buf.toString();
+ }
+
+ try {
+ Parser parser = parserFactory.getParsers(contentType, link)[0];
+ parse = parser.getParse(
+ new Content(link, link, text.getBytes(), contentType, contentMeta,
+ conf)).get(link);
+ } catch (ParserNotFound e) { /* ignore */
+ }
+
+ if (parse != null) {
+ ParseData data = parse.getData();
+ data.getContentMeta().remove(Response.CONTENT_TYPE);
+ mergeMetadata(data.getParseMeta(), parseMeta);
+ parseResult.put(link, new ParseText(parse.getText()), new ParseData(
+ ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
+ .getContentMeta(), data.getParseMeta()));
+ } else {
+ contentMeta.remove(Response.CONTENT_TYPE);
+ parseResult.put(link, new ParseText(text), new ParseData(
+ ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
+ parseMeta));
+ }
+
+ }
+
+ private static String stripTags(SyndContent c) {
+ if (c == null)
+ return "";
+
+ String value = c.getValue();
+
+ String[] parts = value.split("<[^>]*>");
+ StringBuffer buf = new StringBuffer();
+
+ for (String part : parts)
+ buf.append(part);
+
+ return buf.toString().trim();
+ }
+
+ private void addFields(Metadata parseMeta, Metadata contentMeta,
+ SyndFeed feed, SyndEntry entry) {
+ List authors = entry.getAuthors(), categories = entry.getCategories();
+ Date published = entry.getPublishedDate(), updated =
entry.getUpdatedDate();
+ String contentType = null;
+
+ if (authors != null) {
+ for (Object o : authors) {
+ SyndPerson author = (SyndPerson) o;
+ String authorName = author.getName();
+ if (checkString(authorName)) {
+ parseMeta.add(Feed.FEED_AUTHOR, authorName);
+ }
+ }
+ } else {
+ // getAuthors may return null if feed is non-atom
+ // if so, call getAuthor to get Dublin Core module creator.
+ String authorName = entry.getAuthor();
+ if (checkString(authorName)) {
+ parseMeta.set(Feed.FEED_AUTHOR, authorName);
+ }
+ }
+
+ for (Iterator i = categories.iterator(); i.hasNext();) {
+ parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i.next()).getName());
+ }
+
+ if (published != null) {
+ parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
+ }
+ if (updated != null) {
+ parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
+ }
+
+ SyndContent description = entry.getDescription();
+ if (description != null) {
+ contentType = description.getType();
+ } else {
+ // TODO: What to do if contents.size() > 1?
+ List contents = entry.getContents();
+ if (contents.size() > 0) {
+ contentType = ((SyndContent) contents.get(0)).getType();
+ }
+ }
+
+ if (checkString(contentType)) {
+ // ROME may return content-type as html
+ if (contentType.equals("html"))
+ contentType = "text/html";
+ else if (contentType.equals("xhtml"))
+ contentType = "text/xhtml";
+ contentMeta.set(Response.CONTENT_TYPE, contentType + "; " +
CHARSET_UTF8);
+ } else {
+ contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
+ }
+
+ }
+
+ private void mergeMetadata(Metadata first, Metadata second) {
+ for (String name : second.names()) {
+ String[] values = second.getValues(name);
+ for (String value : values) {
+ first.add(name, value);
+ }
+ }
+ }
+
+ private boolean checkString(String s) {
+ return s != null && !s.equals("");
+ }
+
+}
Added:
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?view=auto&rev=548730
==============================================================================
---
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
(added)
+++
lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
Tue Jun 19 07:01:02 2007
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.Map;
+
+// APACHE imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Junit imports
+import junit.framework.TestCase;
+
+/**
+ * Test Suite for the {@link FeedParser}.
+ *
+ * @author mattmann
+ */
+public class TestFeedParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/feed/build.xml during plugin compilation.
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Log LOG = LogFactory.getLog(TestFeedParser.class
+      .getName());
+
+  /**
+   * Default Constructor.
+   *
+   * @param name
+   *          The name of this {@link TestCase}.
+   */
+  public TestFeedParser(String name) {
+    super(name);
+  }
+
+  /**
+   * Runs the {@link FeedParser} over each sample RSS file and verifies that
+   * the {@link ParseResult} holds exactly 3 entries: the feed file itself plus
+   * the 2 links below.
+   * <ul>
+   * <li>http://www-scf.usc.edu/~mattmann/</li>
+   * <li>http://www.nutch.org/</li>
+   * </ul>
+   *
+   * @throws ProtocolNotFound
+   *           If the {@link Protocol}Layer cannot be loaded (required to fetch
+   *           the {@link Content} for the RSS file).
+   * @throws ParseException
+   *           If the {@link Parser}Layer cannot be loaded.
+   */
+  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
+    Configuration conf = NutchConfiguration.create();
+
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      ParseResult parseResult = new ParseUtil(conf).parseByExtensionId("feed",
+          content);
+
+      assertEquals(3, parseResult.size());
+
+      boolean sawHomePage = false;
+      boolean sawNutchSite = false;
+      boolean sawFeedItself = false;
+
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        String key = entry.getKey().toString();
+        if (key.equals("http://www-scf.usc.edu/~mattmann/")) {
+          sawHomePage = true;
+        } else if (key.equals("http://www.nutch.org/")) {
+          sawNutchSite = true;
+        } else if (key.equals(urlString)) {
+          sawFeedItself = true;
+        }
+
+        assertNotNull(entry.getValue());
+        assertNotNull(entry.getValue().getData());
+      }
+
+      if (!(sawHomePage && sawNutchSite && sawFeedItself)) {
+        fail("Outlinks read from sample rss file are not correct!");
+      }
+    }
+
+  }
+
+}