Awww, sniff... bye, parse-rss! On May 4, 2011, at 11:20 AM, <[email protected]> <[email protected]> wrote:
> Author: jnioche > Date: Wed May 4 15:20:00 2011 > New Revision: 1099483 > > URL: http://svn.apache.org/viewvc?rev=1099483&view=rev > Log: > NUTCH-888 : Remove parse-rss > > Added: > nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss > > nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java > Removed: > nutch/branches/branch-1.3/src/plugin/parse-rss/ > Modified: > nutch/branches/branch-1.3/CHANGES.txt > nutch/branches/branch-1.3/conf/parse-plugins.xml > nutch/branches/branch-1.3/src/plugin/build.xml > nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml > > Modified: nutch/branches/branch-1.3/CHANGES.txt > URL: > http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1099483&r1=1099482&r2=1099483&view=diff > ============================================================================== > --- nutch/branches/branch-1.3/CHANGES.txt (original) > +++ nutch/branches/branch-1.3/CHANGES.txt Wed May 4 15:20:00 2011 > @@ -2,6 +2,8 @@ Nutch Change Log > > Release 1.3 - 4/21/2011 > > +* NUTCH-888 Remove parse-rss and add tests for rss to parse-tika (jnioche) > + > * NUTCH-991 SolrDedup must issue a commit (markus) > > * NUTCH 986 SolrDedup fails due to date incorrect format (markus) > > Modified: nutch/branches/branch-1.3/conf/parse-plugins.xml > URL: > http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/parse-plugins.xml?rev=1099483&r1=1099482&r2=1099483&view=diff > ============================================================================== > --- nutch/branches/branch-1.3/conf/parse-plugins.xml (original) > +++ nutch/branches/branch-1.3/conf/parse-plugins.xml Wed May 4 15:20:00 2011 > @@ -27,9 +27,9 @@ > <mimeType name="*"> > <plugin id="parse-tika" /> > </mimeType> > - > + > <mimeType name="application/rss+xml"> > - <plugin id="parse-rss" /> > + <plugin id="parse-tika" /> > <plugin id="feed" /> > </mimeType> > > @@ -65,7 +65,6 @@ > > <mimeType name="text/xml"> > <plugin 
id="parse-tika" /> > - <plugin id="parse-rss" /> > <plugin id="feed" /> > </mimeType> > > @@ -88,8 +87,6 @@ > <alias name="parse-html" > extension-id="org.apache.nutch.parse.html.HtmlParser" /> > <alias name="parse-js" extension-id="JSParser" /> > - <alias name="parse-rss" > - extension-id="org.apache.nutch.parse.rss.RSSParser" /> > <alias name="feed" > extension-id="org.apache.nutch.parse.feed.FeedParser" /> > <alias name="parse-swf" > > Modified: nutch/branches/branch-1.3/src/plugin/build.xml > URL: > http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/build.xml?rev=1099483&r1=1099482&r2=1099483&view=diff > ============================================================================== > --- nutch/branches/branch-1.3/src/plugin/build.xml (original) > +++ nutch/branches/branch-1.3/src/plugin/build.xml Wed May 4 15:20:00 2011 > @@ -45,7 +45,6 @@ > <ant dir="parse-ext" target="deploy"/> > <ant dir="parse-js" target="deploy"/> > <ant dir="parse-html" target="deploy"/> > - <ant dir="parse-rss" target="deploy"/> > <ant dir="parse-swf" target="deploy"/> > <ant dir="parse-tika" target="deploy"/> > <ant dir="parse-zip" target="deploy"/> > @@ -77,7 +76,6 @@ > <ant dir="protocol-file" target="test"/> > <ant dir="protocol-httpclient" target="test"/> > <!--ant dir="parse-ext" target="test"/--> > - <ant dir="parse-rss" target="test"/> > <ant dir="feed" target="test"/> > <ant dir="parse-html" target="test"/> > <ant dir="parse-swf" target="test"/> > @@ -119,7 +117,6 @@ > <ant dir="parse-ext" target="clean"/> > <ant dir="parse-js" target="clean"/> > <ant dir="parse-html" target="clean"/> > - <ant dir="parse-rss" target="clean"/> > <ant dir="parse-swf" target="clean"/> > <ant dir="parse-tika" target="clean"/> > <ant dir="parse-zip" target="clean"/> > > Modified: nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml > URL: > http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml?rev=1099483&r1=1099482&r2=1099483&view=diff > 
============================================================================== > --- nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml (original) > +++ nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml Wed May 4 > 15:20:00 2011 > @@ -29,6 +29,7 @@ > <mkdir dir="${build.test}/data"/> > <copy todir="${build.test}/data"> > <fileset dir="sample"> > + <include name="*.rss"/> > <include name="*.rtf"/> > <include name="*.pdf"/> > <include name="ootest.*"/> > > Added: nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss > URL: > http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss?rev=1099483&view=auto > ============================================================================== > --- nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss (added) > +++ nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss Wed > May 4 15:20:00 2011 > @@ -0,0 +1,37 @@ > +<?xml version="1.0" encoding="ISO-8859-1" ?> > +<!-- > + Licensed to the Apache Software Foundation (ASF) under one or more > + contributor license agreements. See the NOTICE file distributed with > + this work for additional information regarding copyright ownership. > + The ASF licenses this file to You under the Apache License, Version 2.0 > + (the "License"); you may not use this file except in compliance with > + the License. You may obtain a copy of the License at > + > + http://www.apache.org/licenses/LICENSE-2.0 > + > + Unless required by applicable law or agreed to in writing, software > + distributed under the License is distributed on an "AS IS" BASIS, > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + See the License for the specific language governing permissions and > + limitations under the License. 
> +--> > +<rss version="0.91"> > + <channel> > + <title>TestChannel</title> > + <link>http://test.channel.com/</link> > + <description>Sample RSS File for Junit test</description> > + <language>en-us</language> > + > + <item> > + <title>Home Page of Chris Mattmann</title> > + <link>http://www-scf.usc.edu/~mattmann/</link> > + <description>Chris Mattmann's home page</description> > + </item> > + > + <item> > + <title>Awesome Open Source Search Engine</title> > + <link>http://www.nutch.org/</link> > + <description>Yup, that's what it is</description> > + </item> > + </channel> > +</rss> > > Added: > nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java > URL: > http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java?rev=1099483&view=auto > ============================================================================== > --- > nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java > (added) > +++ > nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java > Wed May 4 15:20:00 2011 > @@ -0,0 +1,130 @@ > +/** > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
> + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +package org.apache.nutch.tika; > + > +import junit.framework.TestCase; > + > +import org.apache.commons.logging.Log; > +import org.apache.commons.logging.LogFactory; > +import org.apache.hadoop.conf.Configuration; > +import org.apache.hadoop.io.Text; > +import org.apache.nutch.crawl.CrawlDatum; > +import org.apache.nutch.parse.Outlink; > +import org.apache.nutch.parse.Parse; > +import org.apache.nutch.parse.ParseData; > +import org.apache.nutch.parse.ParseException; > +import org.apache.nutch.parse.ParseUtil; > +import org.apache.nutch.parse.tika.TikaParser; > +import org.apache.nutch.protocol.Content; > +import org.apache.nutch.protocol.Protocol; > +import org.apache.nutch.protocol.ProtocolException; > +import org.apache.nutch.protocol.ProtocolFactory; > +import org.apache.nutch.util.NutchConfiguration; > + > +/** > + * > + * @author mattmann / jnioche > + * > + * Test Suite for the RSS feeds with the {@link TikaParser}. > + * > + */ > +public class TestFeedParser extends TestCase { > + > + private String fileSeparator = System.getProperty("file.separator"); > + > + // This system property is defined in ./src/plugin/build-plugin.xml > + private String sampleDir = System.getProperty("test.data", "."); > + > + private String[] sampleFiles = { "rsstest.rss" }; > + > + public static final Log LOG = LogFactory.getLog(TestFeedParser.class > + .getName()); > + > + /** > + * Default Constructor. > + * > + * @param name > + * The name of this {@link TestCase}. 
> + */ > + public TestFeedParser(String name) { > + super(name); > + } > + > + /** > + * <p> > + * The test method: tests out the following 2 asserts: > + * </p> > + * > + * <ul> > + * <li>There are 3 outlinks read from the sample rss file</li> > + * <li>The 3 outlinks read are in fact the correct outlinks from the > sample > + * file</li> > + * </ul> > + */ > + public void testIt() throws ProtocolException, ParseException { > + String urlString; > + Protocol protocol; > + Content content; > + Parse parse; > + > + Configuration conf = NutchConfiguration.create(); > + for (int i = 0; i < sampleFiles.length; i++) { > + urlString = "file:" + sampleDir + fileSeparator + > sampleFiles[i]; > + > + protocol = new > ProtocolFactory(conf).getProtocol(urlString); > + content = protocol.getProtocolOutput(new > Text(urlString), > + new CrawlDatum()).getContent(); > + parse = new > ParseUtil(conf).parseByExtensionId("parse-tika", > + content).get(content.getUrl()); > + > + // check that there are 2 outlinks: > + // unlike the original parse-rss > + // tika ignores the URL and description of the channel > + > + // http://test.channel.com > + // http://www-scf.usc.edu/~mattmann/ > + // http://www.nutch.org > + > + ParseData theParseData = parse.getData(); > + > + Outlink[] theOutlinks = theParseData.getOutlinks(); > + > + assertTrue("There aren't 2 outlinks read!", > + theOutlinks.length == 2); > + > + // now check to make sure that those are the two > outlinks > + boolean hasLink1 = false, hasLink2 = false; > + > + for (int j = 0; j < theOutlinks.length; j++) { > + if (theOutlinks[j].getToUrl().equals( > + > "http://www-scf.usc.edu/~mattmann/")) { > + hasLink1 = true; > + } > + > + if > (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) { > + hasLink2 = true; > + } > + } > + > + if (!hasLink1 || !hasLink2) { > + fail("Outlinks read from sample rss file are > not correct!"); > + } > + } > + } > + > +} > > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
Chris Mattmann, Ph.D. Senior Computer Scientist NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA Office: 171-266B, Mailstop: 171-246 Email: [email protected] WWW: http://sunset.usc.edu/~mattmann/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Adjunct Assistant Professor, Computer Science Department University of Southern California, Los Angeles, CA 90089 USA ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

