Author: mattmann
Date: Fri Jul 16 17:52:03 2010
New Revision: 964885

URL: http://svn.apache.org/viewvc?rev=964885&view=rev
Log:
- fix for TIKA-466 Feed Parser contributed by jnioche

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java   (with props)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=964885&r1=964884&r2=964885&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Jul 16 17:52:03 2010
@@ -4,6 +4,9 @@ Release 0.8 - Current Development
 
 The most notable changes in Tika 0.8 over previous releases are:
 
+ * Tika now supports parsing Feeds by wrapping the underlying
+   Rome library. (TIKA-466)
+
  * A quick-start guide for Tika parsing was contributed. (TIKA-464)
 
  * An approach for plumbing through XHTML attributes was added. (TIKA-379)

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=964885&r1=964884&r2=964885&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Fri Jul 16 17:52:03 2010
@@ -138,6 +138,11 @@
       <version>1.0.4</version>
     </dependency>
     <dependency>
+      <groupId>rome</groupId>
+      <artifactId>rome</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <scope>test</scope>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=964885&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java Fri Jul 16 17:52:03 2010
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.sun.syndication.feed.synd.SyndContent;
+import com.sun.syndication.feed.synd.SyndEntry;
+import com.sun.syndication.feed.synd.SyndFeed;
+import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ * Feed parser.
+ * <p>
+ * Uses Rome for parsing the feeds. A feed description is put in a paragraph
+ * with its link and title in an anchor.
+ */
+public class FeedParser implements Parser {
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("rss+xml"),
+                    MediaType.application("atom+xml"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException 
{
+        parse(stream, handler, metadata, new ParseContext());
+    }
+
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        SyndFeed feed = null;
+        // set the encoding?
+        try {
+            SyndFeedInput feedInput = new SyndFeedInput();
+            InputSource input = new InputSource(stream);
+            feed = feedInput.build(input);
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage());
+        }
+
+        String feedLink = feed.getLink();
+        String feedDesc = stripTags(feed.getDescriptionEx());
+        String feedTitle = stripTags(feed.getTitleEx());
+
+        metadata.set(Metadata.TITLE, feedTitle);
+        metadata.set(Metadata.DESCRIPTION, feedDesc);
+        // store the other fields in the metadata
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        List entries = feed.getEntries();
+        for (Iterator i = entries.iterator(); i.hasNext();) {
+            SyndEntry entry = (SyndEntry) i.next();
+            String link = entry.getLink();
+            if (link == null)
+                continue;
+            SyndContent description = entry.getDescription();
+
+            String title = stripTags(entry.getTitleEx());
+            xhtml.startElement("a", "href", link);
+            xhtml.characters(title);
+            xhtml.endElement("a");
+            xhtml.startElement("p");
+            if (description != null)
+                xhtml.characters(description.getValue());
+            xhtml.endElement("p");
+        }
+
+        xhtml.endDocument();
+    }
+
+    private static String stripTags(SyndContent c) {
+        if (c == null)
+            return "";
+
+        String value = c.getValue();
+
+        String[] parts = value.split("<[^>]*>");
+        StringBuffer buf = new StringBuffer();
+
+        for (String part : parts)
+            buf.append(part);
+
+        return buf.toString().trim();
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=964885&r1=964884&r2=964885&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Fri Jul 16 17:52:03 2010
@@ -18,6 +18,7 @@ org.apache.tika.parser.audio.AudioParser
 org.apache.tika.parser.audio.MidiParser
 org.apache.tika.parser.dwg.DWGParser
 org.apache.tika.parser.epub.EpubParser
+org.apache.tika.parser.feed.FeedParser
 org.apache.tika.parser.html.HtmlParser
 org.apache.tika.parser.image.ImageParser
 org.apache.tika.parser.image.TiffParser

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java?rev=964885&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java Fri Jul 16 17:52:03 2010
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+public class FeedParserTest extends TestCase {
+
+    /** Parses the sample RSS feed and checks its metadata and body text. */
+    public void testXMLParser() throws Exception {
+        InputStream input = FeedParserTest.class
+                .getResourceAsStream("/test-documents/rsstest.rss");
+        assertNotNull("rsstest.rss not found on the classpath", input);
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+
+            new FeedParser().parse(input, handler, metadata, context);
+
+            assertEquals("Sample RSS File for Junit test",
+                    metadata.get(Metadata.DESCRIPTION));
+            assertEquals("TestChannel", metadata.get(Metadata.TITLE));
+            // Entry text reaches the body; markup itself is not checked.
+            String content = handler.toString();
+            assertNotNull(content);
+            assertTrue(content.contains("Home Page of Chris Mattmann"));
+            assertTrue(content.contains("Yup, that's what it is"));
+        } finally {
+            input.close();
+        }
+    }
+}

Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss?rev=964885&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/rsstest.rss Fri Jul 16 17:52:03 2010
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+       Licensed to the Apache Software Foundation (ASF) under one or more
+       contributor license agreements.  See the NOTICE file distributed with
+       this work for additional information regarding copyright ownership.
+       The ASF licenses this file to You under the Apache License, Version 2.0
+       (the "License"); you may not use this file except in compliance with
+       the License.  You may obtain a copy of the License at
+       
+       http://www.apache.org/licenses/LICENSE-2.0
+       
+       Unless required by applicable law or agreed to in writing, software
+       distributed under the License is distributed on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+       See the License for the specific language governing permissions and
+       limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>


Reply via email to