Author: mattmann Date: Tue Jun 19 07:01:02 2007 New Revision: 548730 URL: http://svn.apache.org/viewvc?view=rev&rev=548730 Log: fix for NUTCH-444
Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java lucene/nutch/trunk/src/plugin/feed/ lucene/nutch/trunk/src/plugin/feed/build.xml lucene/nutch/trunk/src/plugin/feed/lib/ lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar (with props) lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar (with props) lucene/nutch/trunk/src/plugin/feed/plugin.xml lucene/nutch/trunk/src/plugin/feed/sample/ lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss lucene/nutch/trunk/src/plugin/feed/src/ lucene/nutch/trunk/src/plugin/feed/src/java/ lucene/nutch/trunk/src/plugin/feed/src/java/org/ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java lucene/nutch/trunk/src/plugin/feed/src/test/ lucene/nutch/trunk/src/plugin/feed/src/test/org/ lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/ lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/ lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548730&r1=548729&r2=548730 
============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Jun 19 07:01:02 2007 @@ -47,6 +47,12 @@ 15. NUTCH-502 - Bug in SegmentReader causes infinite loop. (Ilya Vishnevsky via dogacan) + +16. NUTCH-444 Possibly use a different library to parse RSS feed for improved + performance and compatibility. This patch introduced a new plugin, feed, + that includes an index filter and a parse plugin for feeds that uses ROME. + There was discussion to remove parse-rss, in light of the feed plugin, + however, this patch does not explicitly remove parse-rss. (dogacan, mattmann) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?view=diff&rev=548730&r1=548729&r2=548730 ============================================================================== --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Jun 19 07:01:02 2007 @@ -35,7 +35,8 @@ </mimeType> <mimeType name="application/rss+xml"> - <plugin id="parse-rss" /> + <plugin id="parse-rss" /> + <plugin id="feed" /> </mimeType> <mimeType name="application/vnd.ms-excel"> @@ -171,6 +172,7 @@ <mimeType name="text/xml"> <plugin id="parse-html" /> <plugin id="parse-rss" /> + <plugin id="feed" /> </mimeType> <!-- Types for parse-ext plugin: required for unit tests to pass. 
--> @@ -204,6 +206,8 @@ extension-id="org.apache.nutch.parse.pdf.PdfParser" /> <alias name="parse-rss" extension-id="org.apache.nutch.parse.rss.RSSParser" /> + <alias name="feed" + extension-id="org.apache.nutch.parse.feed.FeedParser" /> <alias name="parse-rtf" extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" /> <alias name="parse-swf" Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java?view=auto&rev=548730 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Feed.java Tue Jun 19 07:01:02 2007 @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.metadata; + +/** + * A collection of Feed property names extracted by the ROME library. 
+ * + * + * @author mattmann + * @author dogacan + */ +public interface Feed { + + public static final String FEED_AUTHOR = "author"; + + public static final String FEED_TAGS = "tag"; + + public static final String FEED_PUBLISHED = "published"; + + public static final String FEED_UPDATED = "updated"; + + public static final String FEED = "feed"; +} Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=548730&r1=548729&r2=548730 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Tue Jun 19 07:01:02 2007 @@ -36,7 +36,7 @@ * */ public class Metadata implements Writable, CreativeCommons, -DublinCore, HttpHeaders, Nutch, Office { +DublinCore, HttpHeaders, Nutch, Office, Feed { /** * A map of all metadata attributes. 
Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=548730&r1=548729&r2=548730 ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Jun 19 07:01:02 2007 @@ -28,6 +28,7 @@ <target name="deploy"> <ant dir="clustering-carrot2" target="deploy"/> <ant dir="creativecommons" target="deploy"/> + <ant dir="feed" target="deploy"/> <ant dir="index-basic" target="deploy"/> <ant dir="index-more" target="deploy"/> <ant dir="languageidentifier" target="deploy"/> @@ -94,6 +95,7 @@ <ant dir="parse-oo" target="test"/> <ant dir="parse-pdf" target="test"/> <ant dir="parse-rss" target="test"/> + <ant dir="feed" target="test"/> <!-- <ant dir="parse-rtf" target="test"/> --> <ant dir="parse-swf" target="test"/> <ant dir="parse-zip" target="test"/> @@ -115,6 +117,7 @@ <ant dir="analysis-fr" target="clean"/> <ant dir="clustering-carrot2" target="clean"/> <ant dir="creativecommons" target="clean"/> + <ant dir="feed" target="clean"/> <ant dir="index-basic" target="clean"/> <ant dir="index-more" target="clean"/> <ant dir="languageidentifier" target="clean"/> Added: lucene/nutch/trunk/src/plugin/feed/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/build.xml?view=auto&rev=548730 ============================================================================== --- lucene/nutch/trunk/src/plugin/feed/build.xml (added) +++ lucene/nutch/trunk/src/plugin/feed/build.xml Tue Jun 19 07:01:02 2007 @@ -0,0 +1,33 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<project name="feed" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" + dir="../nutch-extensionpoints" /> + <ant target="deploy" inheritall="false" dir="../protocol-file" /> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy file="sample/rsstest.rss" todir="${build.test}/data" /> +</project> Added: lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar?view=auto&rev=548730 ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/feed/lib/jdom.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar?view=auto&rev=548730 ============================================================================== Binary file - no diff available. 
Propchange: lucene/nutch/trunk/src/plugin/feed/lib/rome-0.9.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/feed/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/plugin.xml?view=auto&rev=548730 ============================================================================== --- lucene/nutch/trunk/src/plugin/feed/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/feed/plugin.xml Tue Jun 19 07:01:02 2007 @@ -0,0 +1,48 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0" + provider-name="nutch.org"> + <runtime> + <library name="feed.jar"> + <export name="*" /> + </library> + <library name="rome-0.9.jar" /> + <library name="jdom.jar" /> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints" /> + </requires> + + <extension id="org.apache.nutch.parse.feed" name="Feed Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.feed.FeedParser" + class="org.apache.nutch.parse.feed.FeedParser"> + <parameter name="contentType" value="application/rss+xml" /> + <parameter name="contentType" value="application/atom+xml" /> + <parameter name="contentType" value="text/xml" /> + <parameter name="pathSuffix" value="rss" /> + </implementation> + </extension> + <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="FeedIndexingFilter" + class="org.apache.nutch.indexer.feed.FeedIndexingFilter" /> + </extension> +</plugin> Added: lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss?view=auto&rev=548730 ============================================================================== --- lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss (added) +++ lucene/nutch/trunk/src/plugin/feed/sample/rsstest.rss Tue Jun 19 07:01:02 2007 @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="ISO-8859-1" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<rss version="0.91"> + <channel> + <title>TestChannel</title> + <link>http://test.channel.com/</link> + <description>Sample RSS File for Junit test</description> + <language>en-us</language> + + <item> + <title>Home Page of Chris Mattmann</title> + <link>http://www-scf.usc.edu/~mattmann/</link> + <description>Chris Mattmann's home page</description> + </item> + <item> + <title>Awesome Open Source Search Engine</title> + <link>http://www.nutch.org/</link> + <description>Yup, that's what it is</description> + </item> + </channel> +</rss> Added: lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?view=auto&rev=548730 ============================================================================== --- lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (added) +++ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java Tue Jun 19 07:01:02 2007 @@ -0,0 +1,140 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.feed; + +//JDK imports +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.TimeZone; + +//APACHE imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.metadata.Feed; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; + +/** + * @author dogacan + * @author mattmann + * @since NUTCH-444 + * + * An {@link IndexingFilter} implementation to pull out the + * relevant extracted {@link Metadata} fields from the RSS feeds + * and into the index. + * + */ +public class FeedIndexingFilter implements IndexingFilter { + + public static final String dateFormatStr = "yyyyMMddHHmm"; + + private Configuration conf; + + private final static String PUBLISHED_DATE = "publishedDate"; + + private final static String UPDATED_DATE = "updatedDate"; + + /** + * Extracts out the relevant fields: + * + * <ul> + * <li>FEED_AUTHOR</li> + * <li>FEED_TAGS</li> + * <li>FEED_PUBLISHED</li> + * <li>FEED_UPDATED</li> + * <li>FEED</li> + * </ul> + * + * And sends them to the {@link Indexer} for indexing within the Nutch + * index. 
+ * + */ + public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, + Inlinks inlinks) throws IndexingException { + ParseData parseData = parse.getData(); + Metadata parseMeta = parseData.getParseMeta(); + + String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR); + String[] tags = parseMeta.getValues(Feed.FEED_TAGS); + String published = parseMeta.get(Feed.FEED_PUBLISHED); + String updated = parseMeta.get(Feed.FEED_UPDATED); + String feed = parseMeta.get(Feed.FEED); + + if (authors != null) { + for (String author : authors) { + doc.add(new Field(Feed.FEED_AUTHOR, author, + Field.Store.YES, Field.Index.TOKENIZED)); + } + } + + if (tags != null) { + for (String tag : tags) { + doc.add(new Field(Feed.FEED_TAGS, tag, + Field.Store.YES, Field.Index.TOKENIZED)); + } + } + + if (feed != null) + doc.add(new Field(Feed.FEED, feed, Field.Store.YES, Field.Index.TOKENIZED)); + + SimpleDateFormat sdf = new SimpleDateFormat(dateFormatStr); + sdf.setTimeZone(TimeZone.getTimeZone("GMT")); + if (published != null) { + Date date = new Date(Long.parseLong(published)); + String dateString = sdf.format(date); + doc.add(new Field(PUBLISHED_DATE, dateString, + Field.Store.YES, Field.Index.NO_NORMS)); + } + + if (updated != null) { + Date date = new Date(Long.parseLong(updated)); + String dateString = sdf.format(date); + doc.add(new Field(UPDATED_DATE, dateString, + Field.Store.YES, Field.Index.NO_NORMS)); + } + + return doc; + } + + /** + * @return the {@link Configuration} object used to configure + * this {@link IndexingFilter}. + */ + public Configuration getConf() { + return conf; + } + + /** + * Sets the {@link Configuration} object used to configure this + * {@link IndexingFilter}. + * + * @param conf The {@link Configuration} object used to configure + * this {@link IndexingFilter}. 
+ */ + public void setConf(Configuration conf) { + this.conf = conf; + } + +} Added: lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java?view=auto&rev=548730 ============================================================================== --- lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java (added) +++ lucene/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java Tue Jun 19 07:01:02 2007 @@ -0,0 +1,364 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.feed; + +// JDK imports +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.Map.Entry; + +// APACHE imports +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.metadata.Feed; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.ParserNotFound; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.xml.sax.InputSource; + +// ROME imports +import com.sun.syndication.feed.synd.SyndCategory; +import com.sun.syndication.feed.synd.SyndContent; +import com.sun.syndication.feed.synd.SyndEntry; +import com.sun.syndication.feed.synd.SyndFeed; +import com.sun.syndication.feed.synd.SyndPerson; +import com.sun.syndication.io.SyndFeedInput; + +/** + * + * @author dogacan + * @author mattmann + * @since NUTCH-444 + * + * <p> + * A new RSS/ATOM {@link Parser} that rapidly parses all referenced links + * and content present in the feed. 
+ * </p> + * + */ +public class FeedParser implements Parser { + + public static final String CHARSET_UTF8 = "charset=UTF-8"; + + public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; " + + CHARSET_UTF8; + + public static final Log LOG = LogFactory + .getLog("org.apache.nutch.parse.feed"); + + private Configuration conf; + + private ParserFactory parserFactory; + + private URLNormalizers normalizers; + + private URLFilters filters; + + /** + * Parses the given feed and extracts out and parsers all linked items within + * the feed, using the underlying ROME feed parsing library. + * + * @param content + * A {@link Content} object representing the feed that is being + * parsed by this {@link Parser}. + * + * @return A {@link ParseResult} containing all {@link Parse}d feeds that + * were present in the feed file that this {@link Parser} dealt with. + * + */ + public ParseResult getParse(Content content) { + SyndFeed feed = null; + ParseResult parseResult = new ParseResult(content.getUrl()); + try { + InputSource input = new InputSource(new ByteArrayInputStream(content + .getContent())); + SyndFeedInput feedInput = new SyndFeedInput(); + feed = feedInput.build(input); + } catch (Exception e) { + // return empty parse + LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " + + StringUtils.stringifyException(e)); + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); + } + + List entries = feed.getEntries(); + String feedLink = feed.getLink(); + try { + feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK); + if (feedLink != null) + feedLink = filters.filter(feedLink); + } catch (Exception e) { + feedLink = null; + } + + for (Iterator i = entries.iterator(); i.hasNext();) { + SyndEntry entry = (SyndEntry) i.next(); + addToMap(parseResult, feed, feedLink, entry, content); + } + + String feedDesc = stripTags(feed.getDescriptionEx()); + String 
feedTitle = stripTags(feed.getTitleEx()); + + parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData( + new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0], + content.getMetadata())); + + return parseResult; + } + + /** + * + * Sets the {@link Configuration} object for this {@link Parser}. This + * {@link Parser} expects the following configuration properties to be set: + * + * <ul> + * <li>URLNormalizers - properties in the configuration object to set up the + * default url normalizers.</li> + * <li>URLFilters - properties in the configuration object to set up the + * default url filters.</li> + * </ul> + * + * @param conf + * The Hadoop {@link Configuration} object to use to configure this + * {@link Parser}. + * + */ + public void setConf(Configuration conf) { + this.conf = conf; + this.parserFactory = new ParserFactory(conf); + this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK); + this.filters = new URLFilters(conf); + } + + /** + * + * @return The {@link Configuration} object used to configure this + * {@link Parser}. + */ + public Configuration getConf() { + return this.conf; + } + + /** + * Runs a command line version of this {@link Parser}. + * + * @param args + * A single argument (expected at arg[0]) representing a path on the + * local filesystem that points to a feed file. + * + * @throws Exception + * If any error occurs. 
+ */ + public static void main(String[] args) throws Exception { + if (args.length != 1) { + System.err.println("Usage: FeedParser <feed>"); + System.exit(1); + } + String name = args[0]; + String url = "file:" + name; + Configuration conf = NutchConfiguration.create(); + FeedParser parser = new FeedParser(); + parser.setConf(conf); + File file = new File(name); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + ParseResult parseResult = parser.getParse(new Content(url, url, bytes, + "application/rss+xml", new Metadata(), conf)); + for (Entry<Text, Parse> entry : parseResult) { + System.out.println("key: " + entry.getKey()); + Parse parse = entry.getValue(); + System.out.println("data: " + parse.getData()); + System.out.println("text: " + parse.getText() + "\n"); + } + } + + private void addToMap(ParseResult parseResult, SyndFeed feed, + String feedLink, SyndEntry entry, Content content) { + String link = entry.getLink(), text = null, title = null; + Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata(); + Parse parse = null; + SyndContent description = entry.getDescription(); + + try { + link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK); + if (link != null) + link = filters.filter(link); + } catch (Exception e) { + return; + } + + if (link == null) + return; + + title = stripTags(entry.getTitleEx()); + + if (feedLink != null) + parseMeta.set("feed", feedLink); + + addFields(parseMeta, contentMeta, feed, entry); + + // some item descriptions contain markup text in them, + // so we temporarily set their content-type to parse them + // with another plugin + String contentType = contentMeta.get(Response.CONTENT_TYPE); + + if (description != null) + text = description.getValue(); + + if (text == null) { + List contents = entry.getContents(); + StringBuilder buf = new StringBuilder(); + for (Iterator i = contents.iterator(); i.hasNext();) { + 
SyndContent syndContent = (SyndContent) i.next(); + buf.append(syndContent.getValue()); + } + text = buf.toString(); + } + + try { + Parser parser = parserFactory.getParsers(contentType, link)[0]; + parse = parser.getParse( + new Content(link, link, text.getBytes(), contentType, contentMeta, + conf)).get(link); + } catch (ParserNotFound e) { /* ignore */ + } + + if (parse != null) { + ParseData data = parse.getData(); + data.getContentMeta().remove(Response.CONTENT_TYPE); + mergeMetadata(data.getParseMeta(), parseMeta); + parseResult.put(link, new ParseText(parse.getText()), new ParseData( + ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data + .getContentMeta(), data.getParseMeta())); + } else { + contentMeta.remove(Response.CONTENT_TYPE); + parseResult.put(link, new ParseText(text), new ParseData( + ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta, + parseMeta)); + } + + } + + private static String stripTags(SyndContent c) { + if (c == null) + return ""; + + String value = c.getValue(); + + String[] parts = value.split("<[^>]*>"); + StringBuffer buf = new StringBuffer(); + + for (String part : parts) + buf.append(part); + + return buf.toString().trim(); + } + + private void addFields(Metadata parseMeta, Metadata contentMeta, + SyndFeed feed, SyndEntry entry) { + List authors = entry.getAuthors(), categories = entry.getCategories(); + Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate(); + String contentType = null; + + if (authors != null) { + for (Object o : authors) { + SyndPerson author = (SyndPerson) o; + String authorName = author.getName(); + if (checkString(authorName)) { + parseMeta.add(Feed.FEED_AUTHOR, authorName); + } + } + } else { + // getAuthors may return null if feed is non-atom + // if so, call getAuthor to get Dublin Core module creator. 
+ String authorName = entry.getAuthor(); + if (checkString(authorName)) { + parseMeta.set(Feed.FEED_AUTHOR, authorName); + } + } + + for (Iterator i = categories.iterator(); i.hasNext();) { + parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i.next()).getName()); + } + + if (published != null) { + parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime())); + } + if (updated != null) { + parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime())); + } + + SyndContent description = entry.getDescription(); + if (description != null) { + contentType = description.getType(); + } else { + // TODO: What to do if contents.size() > 1? + List contents = entry.getContents(); + if (contents.size() > 0) { + contentType = ((SyndContent) contents.get(0)).getType(); + } + } + + if (checkString(contentType)) { + // ROME may return content-type as html + if (contentType.equals("html")) + contentType = "text/html"; + else if (contentType.equals("xhtml")) + contentType = "text/xhtml"; + contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8); + } else { + contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE); + } + + } + + private void mergeMetadata(Metadata first, Metadata second) { + for (String name : second.names()) { + String[] values = second.getValues(name); + for (String value : values) { + first.add(name, value); + } + } + } + + private boolean checkString(String s) { + return s != null && !s.equals(""); + } + +} Added: lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?view=auto&rev=548730 ============================================================================== --- lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java (added) +++ lucene/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java Tue 
Jun 19 07:01:02 2007 @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.feed; + +// JDK imports +import java.util.Iterator; +import java.util.Map; + +// APACHE imports +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.ProtocolNotFound; +import org.apache.nutch.util.NutchConfiguration; + +// Junit imports +import junit.framework.TestCase; + +/** + * + * @author mattmann + * + * Test Suite for the {@link FeedParser}. 
+ * + */ +public class TestFeedParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/feed/build.xml during plugin compilation. + + private String[] sampleFiles = { "rsstest.rss" }; + + public static final Log LOG = LogFactory.getLog(TestFeedParser.class + .getName()); + + /** + * Default Constructor. + * + * @param name + * The name of this {@link TestCase}. + */ + public TestFeedParser(String name) { + super(name); + } + + /** + * Calls the {@link FeedParser} on a sample RSS file and checks that there are + * 3 {@link ParseResult} entries including the below 2 links: + * <ul> + * <li>http://www-scf.usc.edu/~mattmann/</li> + * <li>http://www.nutch.org</li> + * </ul> + * + * + * @throws ProtocolNotFound + * If the {@link Protocol}Layer cannot be loaded (required to fetch + * the {@link Content} for the RSS file). + * @throws ParseException + * If the {@link Parser}Layer cannot be loaded. 
+ */ + public void testParseFetchChannel() throws ProtocolNotFound, ParseException { + String urlString; + Protocol protocol; + Content content; + ParseResult parseResult; + + Configuration conf = NutchConfiguration.create(); + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parseResult = new ParseUtil(conf).parseByExtensionId("feed", content); + + assertEquals(3, parseResult.size()); + + boolean hasLink1 = false, hasLink2 = false, hasLink3=false; + + for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j + .hasNext();) { + Map.Entry<Text, Parse> entry = j.next(); + if (entry.getKey().toString().equals( + "http://www-scf.usc.edu/~mattmann/")) { + hasLink1 = true; + } else if (entry.getKey().toString().equals("http://www.nutch.org/")) { + hasLink2 = true; + } + else if(entry.getKey().toString().equals(urlString)){ + hasLink3 = true; + } + + assertNotNull(entry.getValue()); + assertNotNull(entry.getValue().getData()); + } + + if (!hasLink1 || !hasLink2 || !hasLink3) { + fail("Outlinks read from sample rss file are not correct!"); + } + } + + } + +} ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs