Added: tika/site/src/site/apt/1.12/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.12/parser_guide.apt?rev=1731321&view=auto ============================================================================== --- tika/site/src/site/apt/1.12/parser_guide.apt (added) +++ tika/site/src/site/apt/1.12/parser_guide.apt Sat Feb 20 00:42:12 2016 @@ -0,0 +1,141 @@ + -------------------------------------------- + Get Tika parsing up and running in 5 minutes + -------------------------------------------- + +~~ Licensed to the Apache Software Foundation (ASF) under one or more +~~ contributor license agreements. See the NOTICE file distributed with +~~ this work for additional information regarding copyright ownership. +~~ The ASF licenses this file to You under the Apache License, Version 2.0 +~~ (the "License"); you may not use this file except in compliance with +~~ the License. You may obtain a copy of the License at +~~ +~~ http://www.apache.org/licenses/LICENSE-2.0 +~~ +~~ Unless required by applicable law or agreed to in writing, software +~~ distributed under the License is distributed on an "AS IS" BASIS, +~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +~~ See the License for the specific language governing permissions and +~~ limitations under the License. + +Get Tika parsing up and running in 5 minutes + + This page is a quick start guide showing how to add a new parser to Apache Tika. + Following the simple steps listed below your new parser can be running in only 5 minutes. + +%{toc|section=1|fromDepth=1} + +* {Getting Started} + + The {{{./gettingstarted.html}Getting Started}} document describes how to + build Apache Tika from sources and how to start using Tika in an application. Pay close attention + and follow the instructions in the "Getting and building the sources" section. 
+ + +* {Add your MIME-Type} + + Tika loads the core, standard MIME-Types from the file + "org/apache/tika/mime/tika-mimetypes.xml", which comes from + {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} . + If your new MIME-Type is a standard one which is missing from Tika, + submit a patch for this file! + + If your MIME-Type needs adding, create a new file + "org/apache/tika/mime/custom-mimetypes.xml" in your codebase. + You should add to it something like this: + +--- + <?xml version="1.0" encoding="UTF-8"?> + <mime-info> + <mime-type type="application/hello"> + <glob pattern="*.hi"/> + </mime-type> + </mime-info> +--- + +* {Create your Parser class} + + Now, you need to create your new parser. This is a class that must + implement the Parser interface offered by Tika. Instead of implementing + the Parser interface directly, it is recommended that you extend the + abstract class AbstractParser if possible. AbstractParser handles + translating between API changes for you. + + A very simple Tika Parser looks like this: + +--- +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * @Author: Arturo Beltran + */ +package org.apache.tika.parser.hello; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class HelloParser extends AbstractParser { + + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("hello")); + public static final String HELLO_MIME_TYPE = "application/hello"; + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE); + metadata.set("Hello", "World"); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.endDocument(); + } +} +--- + + Pay special attention to the definition of the SUPPORTED_TYPES static class + field in the parser class that defines what MIME-Types it supports. If + your MIME-Types aren't standard ones, ensure you have listed them in a + "custom-mimetypes.xml" file so that Tika knows about them (see above). + + It is in the "parse" method that you will do all your work, that is, extract + the information from the resource and then set the metadata. + +* {List the new parser} + + Finally, you should explicitly tell the AutoDetectParser to include your new + parser. This step is only needed if you want to use the AutoDetectParser functionality. 
+ If you figure out the correct parser in a different way, it isn't needed. + + List your new parser in: + {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} + +
Modified: tika/site/src/site/apt/download.apt.vm URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/download.apt.vm?rev=1731321&r1=1731320&r2=1731321&view=diff ============================================================================== --- tika/site/src/site/apt/download.apt.vm (original) +++ tika/site/src/site/apt/download.apt.vm Sat Feb 20 00:42:12 2016 @@ -25,18 +25,18 @@ Download Apache Tika * {{{http://www.apache.org/dyn/closer.cgi/tika/tika-${project.parent.version}-src.zip}Mirrors for apache-tika-${project.parent.version}-src.zip}} (source archive, {{{http://www.apache.org/dist/tika/tika-${project.parent.version}-src.zip.asc}PGP signature}})\ - SHA1: <<<d0dde7b3a4f1a2fb6ccd741552ea180dddab630a>>>\ - MD5: <<<ccca11a7e5c300e438b2a52012cf4e39>>> + SHA1: <<<30e64645af643959841ac3bb3c41f7e64eba7e5f>>>\ + MD5: <<<ccf8adb2260476244618a488a905490b>>> * {{{http://www.apache.org/dyn/closer.cgi/tika/tika-app-${project.parent.version}.jar}Mirrors for tika-app-${project.parent.version}.jar}} (runnable jar, {{{http://www.apache.org/dist/tika/tika-app-${project.parent.version}.jar.asc}PGP signature}})\ - SHA1: <<<59cc7c4c48a6a41899ca282d925b2738d05a45a8>>>\ - MD5: <<<3e133bcb3cd709fddd1bda3eebc1a0e5>>>\ + SHA1: <<<8d5c5f9e14b53a807a9d3d99ef34e63c38b9b418>>>\ + MD5: <<<bf0346321c71ff62f514e096086f5346>>>\ * {{{http://www.apache.org/dyn/closer.cgi/tika/tika-server-${project.parent.version}.jar}Mirrors for tika-server-${project.parent.version}.jar}} (runnable jar, {{{http://www.apache.org/dist/tika/tika-server-${project.parent.version}.jar.asc}PGP signature}})\ - SHA1: <<<c1ca6453573fb7fa1f6b3d81dc4c9847a9a86a62>>>\ - MD5: <<<7e28f3288c3bcd0c26ac6f557ddfb977>>> + SHA1: <<<e9655cbf4f15e9d2934d697708b66d9eeeca4ee1>>>\ + MD5: <<<cf34921c57ef5d6002f3088536d2f2ed>>> [] Modified: tika/site/src/site/apt/index.apt.vm URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/index.apt.vm?rev=1731321&r1=1731320&r2=1731321&view=diff 
============================================================================== --- tika/site/src/site/apt/index.apt.vm (original) +++ tika/site/src/site/apt/index.apt.vm Sat Feb 20 00:42:12 2016 @@ -39,6 +39,15 @@ Apache Tika - a content analysis toolkit Latest News + [19 February 2016: Apache Tika Release] + Apache Tika 1.12 has been released! This release includes some improvements + to Named Entity Recognition (Stanford NER integration and Apache OpenNLP) + as well as efficiency improvements to the GeoTopicParser. There are + also bugfixes to the Tika REST server in this release. Please see the + {{{https://dist.apache.org/repos/dist/release/tika/CHANGES-1.12.txt}CHANGES.txt}} + file for a full list of changes in this release and have a look at the download + page for more information on how to obtain Apache Tika 1.12. + [25 October 2015: Apache Tika Release] Apache Tika 1.11 has been released! This release includes several improvements that better utilize Java7 support, that help extract more content using the Modified: tika/site/src/site/site.xml URL: http://svn.apache.org/viewvc/tika/site/src/site/site.xml?rev=1731321&r1=1731320&r2=1731321&view=diff ============================================================================== --- tika/site/src/site/site.xml (original) +++ tika/site/src/site/site.xml Sat Feb 20 00:42:12 2016 @@ -40,7 +40,17 @@ <item name="Issue Tracker" href="https://issues.apache.org/jira/browse/TIKA"/> </menu> <menu name="Documentation"> - <item name="Apache Tika 1.11" href="1.11/index.html"> + <item name="Apache Tika 1.12" href="1.12/index.html"> + <item name="Getting Started" href="1.12/gettingstarted.html"/> + <item name="Supported Formats" href="1.12/formats.html"/> + <item name="Parser API" href="1.12/parser.html"/> + <item name="Parser 5min Quick Start Guide" href="1.12/parser_guide.html"/> + <item name="Content and Language Detection" href="1.12/detection.html"/> + <item name="Configuring Tika" href="1.12/configuring.html"/> + 
<item name="Usage Examples" href="1.12/examples.html"/> + <item name="API Documentation" href="1.12/api/"/> + </item> + <item name="Apache Tika 1.11" href="1.11/index.html" collapse="true"> <item name="Getting Started" href="1.11/gettingstarted.html"/> <item name="Supported Formats" href="1.11/formats.html"/> <item name="Parser API" href="1.11/parser.html"/> @@ -70,15 +80,6 @@ <item name="Usage Examples" href="1.9/examples.html"/> <item name="API Documentation" href="1.9/api/"/> </item> - <item name="Apache Tika 1.8" href="1.8/index.html" collapse="true"> - <item name="Getting Started" href="1.8/gettingstarted.html"/> - <item name="Supported Formats" href="1.8/formats.html"/> - <item name="Parser API" href="1.8/parser.html"/> - <item name="Parser 5min Quick Start Guide" href="1.8/parser_guide.html"/> - <item name="Content and Language Detection" href="1.8/detection.html"/> - <item name="Usage Examples" href="1.8/examples.html"/> - <item name="API Documentation" href="1.8/api/"/> - </item> </menu> <menu name="The Apache Software Foundation"> <item name="About" href="http://www.apache.org/foundation/"/>
