Author: mattmann
Date: Fri Jul 16 17:52:03 2010
New Revision: 964885
URL: http://svn.apache.org/viewvc?rev=964885&view=rev
Log:
- fix for TIKA-466 Feed Parser contributed by jnioche
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
(with props)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=964885&r1=964884&r2=964885&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Jul 16 17:52:03 2010
@@ -4,6 +4,9 @@ Release 0.8 - Current Development
The most notable changes in Tika 0.8 over previous releases are:
+ * Tika now supports parsing Feeds by wrapping the underlying
+ Rome library. (TIKA-466)
+
* A quick-start guide for Tika parsing was contributed. (TIKA-464)
* An approach for plumbing through XHTML attributes was added. (TIKA-379)
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=964885&r1=964884&r2=964885&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Fri Jul 16 17:52:03 2010
@@ -138,6 +138,11 @@
<version>1.0.4</version>
</dependency>
<dependency>
+ <groupId>rome</groupId>
+ <artifactId>rome</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=964885&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
Fri Jul 16 17:52:03 2010
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.sun.syndication.feed.synd.SyndContent;
+import com.sun.syndication.feed.synd.SyndEntry;
+import com.sun.syndication.feed.synd.SyndFeed;
+import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ * Feed parser.
+ * <p>
+ * Uses Rome for parsing the feeds. A feed description is put in a paragraph
+ * with its link and title in an anchor.
+ */
+public class FeedParser implements Parser {
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections
+ .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("rss+xml"),
+ MediaType.application("atom+xml"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata) throws IOException, SAXException, TikaException
{
+ parse(stream, handler, metadata, new ParseContext());
+ }
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ SyndFeed feed = null;
+ // set the encoding?
+ try {
+ SyndFeedInput feedInput = new SyndFeedInput();
+ InputSource input = new InputSource(stream);
+ feed = feedInput.build(input);
+ } catch (Exception e) {
+ throw new TikaException(e.getMessage());
+ }
+
+ String feedLink = feed.getLink();
+ String feedDesc = stripTags(feed.getDescriptionEx());
+ String feedTitle = stripTags(feed.getTitleEx());
+
+ metadata.set(Metadata.TITLE, feedTitle);
+ metadata.set(Metadata.DESCRIPTION, feedDesc);
+ // store the other fields in the metadata
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ List entries = feed.getEntries();
+ for (Iterator i = entries.iterator(); i.hasNext();) {
+ SyndEntry entry = (SyndEntry) i.next();
+ String link = entry.getLink();
+ if (link == null)
+ continue;
+ SyndContent description = entry.getDescription();
+
+ String title = stripTags(entry.getTitleEx());
+ xhtml.startElement("a", "href", link);
+ xhtml.characters(title);
+ xhtml.endElement("a");
+ xhtml.startElement("p");
+ if (description != null)
+ xhtml.characters(description.getValue());
+ xhtml.endElement("p");
+ }
+
+ xhtml.endDocument();
+ }
+
+ private static String stripTags(SyndContent c) {
+ if (c == null)
+ return "";
+
+ String value = c.getValue();
+
+ String[] parts = value.split("<[^>]*>");
+ StringBuffer buf = new StringBuffer();
+
+ for (String part : parts)
+ buf.append(part);
+
+ return buf.toString().trim();
+ }
+}
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=964885&r1=964884&r2=964885&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
(original)
+++
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Fri Jul 16 17:52:03 2010
@@ -18,6 +18,7 @@ org.apache.tika.parser.audio.AudioParser
org.apache.tika.parser.audio.MidiParser
org.apache.tika.parser.dwg.DWGParser
org.apache.tika.parser.epub.EpubParser
+org.apache.tika.parser.feed.FeedParser
org.apache.tika.parser.html.HtmlParser
org.apache.tika.parser.image.ImageParser
org.apache.tika.parser.image.TiffParser
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java?rev=964885&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
(added)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
Fri Jul 16 17:52:03 2010
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+public class FeedParserTest extends TestCase {
+
+    /**
+     * Parses the sample RSS document and checks the extracted metadata and
+     * the text content produced for the feed entries.
+     */
+    public void testXMLParser() throws Exception {
+        InputStream input = FeedParserTest.class
+                .getResourceAsStream("/test-documents/rsstest.rss");
+        // Fail with a clear message instead of a NullPointerException when
+        // the test document is missing from the classpath.
+        assertNotNull("test document rsstest.rss not found", input);
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+
+            new FeedParser().parse(input, handler, metadata, context);
+
+            String content = handler.toString();
+            assertNotNull(content);
+
+            assertEquals("Sample RSS File for Junit test",
+                    metadata.get(Metadata.DESCRIPTION));
+            assertEquals("TestChannel", metadata.get(Metadata.TITLE));
+
+            // The title and description of every item must show up in the
+            // extracted text.
+            assertTrue(content.contains("Home Page of Chris Mattmann"));
+            assertTrue(content.contains("Chris Mattmann's home page"));
+            assertTrue(content.contains("Awesome Open Source Search Engine"));
+            assertTrue(content.contains("Yup, that's what it is"));
+
+            // TODO find a way of testing the paragraphs and anchors
+
+        } finally {
+            input.close();
+        }
+    }
+
+}
Propchange:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss?rev=964885&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss
(added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss Fri
Jul 16 17:52:03 2010
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<rss version="0.91">
+ <channel>
+ <title>TestChannel</title>
+ <link>http://test.channel.com/</link>
+ <description>Sample RSS File for Junit test</description>
+ <language>en-us</language>
+
+ <item>
+ <title>Home Page of Chris Mattmann</title>
+ <link>http://www-scf.usc.edu/~mattmann/</link>
+ <description>Chris Mattmann's home page</description>
+ </item>
+ <item>
+ <title>Awesome Open Source Search Engine</title>
+ <link>http://www.nutch.org/</link>
+ <description>Yup, that's what it is</description>
+ </item>
+ </channel>
+</rss>