Author: mattmann
Date: Fri Jul 16 17:39:07 2010
New Revision: 964869

URL: http://svn.apache.org/viewvc?rev=964869&view=rev
Log:
- fix for TIKA-464 Contribute a "get Tika parsing up and running in 5 minutes" 
quick start guide

Added:
    tika/site/src/site/apt/0.7/parser_guide.apt
    tika/trunk/src/site/apt/parser_guide.apt
Modified:
    tika/site/src/site/site.xml
    tika/trunk/CHANGES.txt
    tika/trunk/src/site/site.xml

Added: tika/site/src/site/apt/0.7/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.7/parser_guide.apt?rev=964869&view=auto
==============================================================================
--- tika/site/src/site/apt/0.7/parser_guide.apt (added)
+++ tika/site/src/site/apt/0.7/parser_guide.apt Fri Jul 16 17:39:07 2010
@@ -0,0 +1,135 @@
+                       --------------------------------------------
+                       Get Tika parsing up and running in 5 minutes
+                       --------------------------------------------
+                                          Arturo Beltran
+                                          
--------------------------------------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements.  See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License.  You may obtain a copy of the License at
+~~
+~~     http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Get Tika parsing up and running in 5 minutes
+
+   This page is a quick start guide showing how to add a new parser to Apache 
Tika.
+   By following the simple steps listed below, your new parser can be up and 
+   running in only 5 minutes.
+
+%{toc|section=1|fromDepth=1}
+
+* {Getting Started}
+
+   The {{{gettingstarted.html}Getting Started}} document describes how to 
+   build Apache Tika from sources and how to start using Tika in an 
application. Pay close attention 
+   and follow the instructions in the "Getting and building the sources" 
section.
+   
+
+* {Add your MIME-Type}
+
+   You first need to modify 
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
+   so that Tika can map the file extension to its MIME-Type. You should 
+   add something like this:
+   
+---
+ <mime-type type="application/hello">
+       <glob pattern="*.hi"/>
+ </mime-type>
+---
+
+* {Create your Parser class}
+
+   Now, you need to create your new parser. This is a class that must 
implement the Parser interface 
+   offered by Tika. A very simple Tika Parser looks like this:
+   
+---
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * @Author: Arturo Beltran
+ */
+package org.apache.tika.parser.hello;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class HelloParser implements Parser { // minimal sample parser for the "application/hello" type
+
+       private static final Set<MediaType> SUPPORTED_TYPES = 
Collections.singleton(MediaType.application("hello")); // the single type returned by getSupportedTypes()
+       public static final String HELLO_MIME_TYPE = "application/hello"; // stored as the content type in parse()
+       
+       public Set<MediaType> getSupportedTypes(ParseContext context) { // context is not consulted
+               return SUPPORTED_TYPES;
+       }
+
+       public void parse(
+                       InputStream stream, ContentHandler handler,
+                       Metadata metadata, ParseContext context)
+                       throws IOException, SAXException, TikaException { // stream and context are never read in this sample
+
+               metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE); // record the type this parser handles
+               metadata.set("Hello", "World"); // sample custom metadata entry
+
+               XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+               xhtml.startDocument(); // the document is opened and closed with no body content written in between
+               xhtml.endDocument();
+       }
+
+       /**
+        * Three-argument overload that forwards to the four-argument parse
+        * method, passing a newly created ParseContext.
+        *
+        * @deprecated This method will be removed in Apache Tika 1.0.
+        */
+       public void parse(
+                       InputStream stream, ContentHandler handler, Metadata 
metadata)
+                       throws IOException, SAXException, TikaException {
+               parse(stream, handler, metadata, new ParseContext());
+       }
+}
+---
+   
+   Pay special attention to the definition of the SUPPORTED_TYPES static class 
+   field in the parser class that defines what MIME-Types it supports. 
+   
+   The "parse" method is where you will do all your work: that is, extract 
+   the information from the resource and then set the metadata.
+
+* {List the new parser}
+
+   Finally, you should explicitly tell the AutoDetectParser to include your 
new 
+   parser. This step is only needed if you want to use the AutoDetectParser 
functionality. 
+   If you select the correct parser in some other way, this step isn't needed. 
+   
+   List your new parser in:
+    
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+   
+

Modified: tika/site/src/site/site.xml
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/site.xml?rev=964869&r1=964868&r2=964869&view=diff
==============================================================================
--- tika/site/src/site/site.xml (original)
+++ tika/site/src/site/site.xml Fri Jul 16 17:39:07 2010
@@ -44,6 +44,7 @@
         <item name="Getting Started" href="0.7/gettingstarted.html"/>
         <item name="Supported Formats" href="0.7/formats.html"/>
         <item name="Parser API" href="0.7/parser.html"/>
+        <item name="Parser 5min Quick Start Guide" href="0.7/parser_guide.html"/>
      
         <item name="API Documentation" href="0.7/api/"/>
       </item>
       <item name="Tika 0.6" href="0.6/index.html">

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=964869&r1=964868&r2=964869&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Jul 16 17:39:07 2010
@@ -4,6 +4,8 @@ Release 0.8 - Current Development
 
 The most notable changes in Tika 0.8 over previous releases are:
 
+ * A quick-start guide for Tika parsing was contributed. (TIKA-464)
+
  * An approach for plumbing through XHTML attributes was added. (TIKA-379)
 
  * Media type hierarchy information is now taken into account when

Added: tika/trunk/src/site/apt/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/trunk/src/site/apt/parser_guide.apt?rev=964869&view=auto
==============================================================================
--- tika/trunk/src/site/apt/parser_guide.apt (added)
+++ tika/trunk/src/site/apt/parser_guide.apt Fri Jul 16 17:39:07 2010
@@ -0,0 +1,135 @@
+                       --------------------------------------------
+                       Get Tika parsing up and running in 5 minutes
+                       --------------------------------------------
+                                          Arturo Beltran
+                                          
--------------------------------------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements.  See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License.  You may obtain a copy of the License at
+~~
+~~     http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Get Tika parsing up and running in 5 minutes
+
+   This page is a quick start guide showing how to add a new parser to Apache 
Tika.
+   By following the simple steps listed below, your new parser can be up and 
+   running in only 5 minutes.
+
+%{toc|section=1|fromDepth=1}
+
+* {Getting Started}
+
+   The {{{gettingstarted.html}Getting Started}} document describes how to 
+   build Apache Tika from sources and how to start using Tika in an 
application. Pay close attention 
+   and follow the instructions in the "Getting and building the sources" 
section.
+   
+
+* {Add your MIME-Type}
+
+   You first need to modify 
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
+   so that Tika can map the file extension to its MIME-Type. You should 
+   add something like this:
+   
+---
+ <mime-type type="application/hello">
+       <glob pattern="*.hi"/>
+ </mime-type>
+---
+
+* {Create your Parser class}
+
+   Now, you need to create your new parser. This is a class that must 
implement the Parser interface 
+   offered by Tika. A very simple Tika Parser looks like this:
+   
+---
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * @Author: Arturo Beltran
+ */
+package org.apache.tika.parser.hello;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class HelloParser implements Parser { // minimal sample parser for the "application/hello" type
+
+       private static final Set<MediaType> SUPPORTED_TYPES = 
Collections.singleton(MediaType.application("hello")); // the single type returned by getSupportedTypes()
+       public static final String HELLO_MIME_TYPE = "application/hello"; // stored as the content type in parse()
+       
+       public Set<MediaType> getSupportedTypes(ParseContext context) { // context is not consulted
+               return SUPPORTED_TYPES;
+       }
+
+       public void parse(
+                       InputStream stream, ContentHandler handler,
+                       Metadata metadata, ParseContext context)
+                       throws IOException, SAXException, TikaException { // stream and context are never read in this sample
+
+               metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE); // record the type this parser handles
+               metadata.set("Hello", "World"); // sample custom metadata entry
+
+               XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+               xhtml.startDocument(); // the document is opened and closed with no body content written in between
+               xhtml.endDocument();
+       }
+
+       /**
+        * Three-argument overload that forwards to the four-argument parse
+        * method, passing a newly created ParseContext.
+        *
+        * @deprecated This method will be removed in Apache Tika 1.0.
+        */
+       public void parse(
+                       InputStream stream, ContentHandler handler, Metadata 
metadata)
+                       throws IOException, SAXException, TikaException {
+               parse(stream, handler, metadata, new ParseContext());
+       }
+}
+---
+   
+   Pay special attention to the definition of the SUPPORTED_TYPES static class 
+   field in the parser class that defines what MIME-Types it supports. 
+   
+   The "parse" method is where you will do all your work: that is, extract 
+   the information from the resource and then set the metadata.
+
+* {List the new parser}
+
+   Finally, you should explicitly tell the AutoDetectParser to include your 
new 
+   parser. This step is only needed if you want to use the AutoDetectParser 
functionality. 
+   If you select the correct parser in some other way, this step isn't needed. 
+   
+   List your new parser in:
+    
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+   
+

Modified: tika/trunk/src/site/site.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/src/site/site.xml?rev=964869&r1=964868&r2=964869&view=diff
==============================================================================
--- tika/trunk/src/site/site.xml (original)
+++ tika/trunk/src/site/site.xml Fri Jul 16 17:39:07 2010
@@ -20,13 +20,13 @@
 <project name="Apache Tika">
   <bannerLeft>
     <alt>Apache Tika</alt>
-    <src>http://lucene.apache.org/tika/tika.png</src>
-    <href>http://lucene.apache.org/tika/</href>
+    <src>http://tika.apache.org/tika.png</src>
+    <href>http://tika.apache.org</href>
   </bannerLeft>
   <bannerRight>
-    <alt>Apache Lucene</alt>
-    <src>http://lucene.apache.org/images/lucene_green_300.gif</src>
-    <href>http://lucene.apache.org/</href>
+    <alt>Apache</alt>
+    <src>http://www.apache.org/images/feather-small.gif</src>
+    <href>http://www.apache.org/</href>
   </bannerRight>
   <body>
     <head>
@@ -40,6 +40,7 @@
       <item name="Getting Started" href="gettingstarted.html"/>
       <item name="Supported Formats" href="formats.html"/>
       <item name="Parser API" href="parser.html"/>
+      <item name="Parser 5min Quick Start Guide" href="parser_guide.html"/>
     </menu>
     <menu ref="reports"/>
   </body>


Reply via email to