Author: nick
Date: Sat Aug  1 15:24:45 2015
New Revision: 1693715

URL: http://svn.apache.org/r1693715
Log:
TIKA-1702 more documentation on configuration

Added:
    tika/site/publish/1.9/configuring.html
Modified:
    tika/site/src/site/apt/1.9/configuring.apt

Added: tika/site/publish/1.9/configuring.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.9/configuring.html?rev=1693715&view=auto
==============================================================================
--- tika/site/publish/1.9/configuring.html (added)
+++ tika/site/publish/1.9/configuring.html Sat Aug  1 15:24:45 2015
@@ -0,0 +1,394 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+
+
+
+
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <title>Apache Tika - Configuring Tika</title>
+    <style type="text/css" media="all">
+      @import url("../css/site.css");
+    </style>
+    <link rel="icon" type="image/png" href="../tikaNoText16.png" />
+    <script type="text/javascript">
+      function selectProvider(form) {
+        provider = form.elements['searchProvider'].value;
+        if (provider == "any") {
+          if (Math.random() > 0.5) {
+            provider = "lucid";
+          } else {
+            provider = "sl";
+          }
+        }
+        if (provider == "lucid") {
+          form.action = "http://find.searchhub.org/p:tika";
+        } else if (provider == "sl") {
+          form.action = "http://search-lucene.com/tika";
+        }
+        days = 90;
+        date = new Date();
+        date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+        expires = "; expires=" + date.toGMTString();
+        document.cookie = "searchProvider=" + provider + expires + "; path=/";
+      }
+      function initProvider() {
+        if (document.cookie.length>0) {
+          cStart=document.cookie.indexOf("searchProvider=");
+          if (cStart!=-1) {
+            cStart=cStart + "searchProvider=".length;
+            cEnd=document.cookie.indexOf(";", cStart);
+            if (cEnd==-1) {
+              cEnd=document.cookie.length;
+            }
+            provider = unescape(document.cookie.substring(cStart,cEnd));
+            document.forms['searchform'].elements['searchProvider'].value = provider;
+          }
+        }
+        document.forms['searchform'].elements['q'].focus();
+      }
+    </script>
+  </head>
+  <body onLoad="initProvider();">
+    <div id="body">
+      <div id="banner">
+        <a href="http://tika.apache.org" id="bannerLeft" title="Apache Tika"
+          ><img src="http://tika.apache.org/tika.png" alt="Apache Tika"
+                width="292" height="100"/></a>
+        <a href="http://www.apache.org/" id="bannerRight"
+           title="The Apache Software Foundation"
+          ><img src="http://tika.apache.org/asf-logo.gif" alt="The Apache Software Foundation"
+                width="387" height="100"/></a>
+      </div>
+      <div id="content">
+        <!-- Licensed to the Apache Software Foundation (ASF) under one or 
more --><!-- contributor license agreements.  See the NOTICE file distributed 
with --><!-- this work for additional information regarding copyright 
ownership. --><!-- The ASF licenses this file to You under the Apache License, 
Version 2.0 --><!-- (the "License"); you may not use this file except in 
compliance with --><!-- the License.  You may obtain a copy of the License at 
--><!--  --><!-- http://www.apache.org/licenses/LICENSE-2.0 --><!--  --><!-- 
Unless required by applicable law or agreed to in writing, software --><!-- 
distributed under the License is distributed on an "AS IS" BASIS, --><!-- 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
--><!-- See the License for the specific language governing permissions and 
--><!-- limitations under the License. --><div class="section">
+<h2>Configuring Tika<a name="Configuring_Tika"></a></h2>
+<p>Out of the box, Apache Tika will attempt to start with all available 
Detectors and Parsers, running with sensible defaults. For most users, this 
default configuration will work well.</p>
+<p>This page gives you information on how to configure the various components 
of Apache Tika, such as Parsers and Detectors, if you need fine-grained control 
over ordering, exclusions and the like.</p>
+<ul>
+<li><a href="#Configuring_Tika">Configuring Tika</a>
+<ul>
+<li><a href="#Configuring_Parsers">Configuring Parsers</a></li>
+<li><a href="#Configuring_Detectors">Configuring Detectors</a></li>
+<li><a href="#Configuring_Mime_Types">Configuring Mime Types</a></li>
+<li><a href="#Configuring_Language_Identifiers">Configuring Language 
Identifiers</a></li>
+<li><a href="#Configuring_Translators">Configuring Translators</a></li>
+<li><a href="#Using_a_Tika_Configuration_XML_file">Using a Tika Configuration 
XML file</a></li></ul></li></ul>
+<div class="section">
+<h3><a name="Configuring_Parsers">Configuring Parsers</a></h3><!-- TODO Add 
more on in 1.10, which has more support -->
+<p>In Tika 1.9, there is some support for configuring Parsers in the Tika 
Config xml. You can provide a custom list of parsers to use, in a custom order, 
and you can also force certain mimetypes to be used or not-used for parsers. 
You can do so with a Tika Config something like:</p>
+<div>
+<pre>&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
+&lt;properties&gt;
+  &lt;parsers&gt;
+    &lt;!-- Default Parser for most things, except for 2 mime types --&gt;
+    &lt;parser class=&quot;org.apache.tika.parser.DefaultParser&quot;&gt;
+      &lt;mime-exclude&gt;image/jpeg&lt;/mime-exclude&gt;
+      &lt;mime-exclude&gt;application/pdf&lt;/mime-exclude&gt;
+    &lt;/parser&gt;
+    &lt;!-- Use a different parser for PDF --&gt;
+    &lt;parser class=&quot;org.apache.tika.parser.EmptyParser&quot;&gt;
+      &lt;mime&gt;application/pdf&lt;/mime&gt;
+    &lt;/parser&gt;
+  &lt;/parsers&gt;
+&lt;/properties&gt;</pre></div>
+<p>In code, the key classes to use to build up your own custom parser 
hierarchy are <a 
href="./api/org/apache/tika/parser/DefaultParser.html">org.apache.tika.parser.DefaultParser</a>,
 <a 
href="./api/org/apache/tika/parser/CompositeParser.html">org.apache.tika.parser.CompositeParser</a>
 and <a 
href="./api/org/apache/tika/parser/ParserDecorator.html">org.apache.tika.parser.ParserDecorator</a>.</p></div>
+<div class="section">
+<h3><a name="Configuring_Detectors">Configuring Detectors</a></h3><!-- TODO 
Add more on in 1.10, which has more support -->
+<p>In Tika 1.9, there is limited support for configuring Detectors in the Tika 
Config xml. You can provide a custom list of detectors to use, in a custom 
order, with Tika Config something like:</p>
+<div>
+<pre>&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
+&lt;properties&gt;
+  &lt;detectors&gt;
+    &lt;!-- Only use these two detectors, and ignore all others --&gt;
+    &lt;detector 
class=&quot;org.apache.tika.parser.pkg.ZipContainerDetector&quot;/&gt;
+    &lt;detector class=&quot;org.apache.tika.mime.MimeTypes&quot;/&gt;
+  &lt;/detectors&gt;
+&lt;/properties&gt;</pre></div>
+<p>In code, the key classes to use to build up your own custom detector 
hierarchy are <a 
href="./api/org/apache/tika/detect/DefaultDetector.html">org.apache.tika.detect.DefaultDetector</a>
 and <a 
href="./api/org/apache/tika/detect/CompositeDetector.html">org.apache.tika.detect.CompositeDetector</a>.</p></div>
+<div class="section">
+<h3><a name="Configuring_Mime_Types">Configuring Mime Types</a></h3>
+<p>TODO Mention non-standard paths, and custom mime type files</p></div>
+<div class="section">
+<h3><a name="Configuring_Language_Identifiers">Configuring Language 
Identifiers</a></h3>
+<p>At this time, there is no unified way to configure language identifiers. 
While the work on that is ongoing, for now you will need to review the <a 
href="./api/">Tika Javadocs</a> to see how individual identifiers are 
configured.</p></div>
+<div class="section">
+<h3><a name="Configuring_Translators">Configuring Translators</a></h3>
+<p>At this time, there is no unified way to configure Translators. While the 
work on that is ongoing, for now you will need to review the <a 
href="./api/">Tika Javadocs</a> to see how individual Translators are 
configured.</p></div>
+<div class="section">
+<h3><a name="Using_a_Tika_Configuration_XML_file">Using a Tika Configuration 
XML file</a></h3>
+<p>However you call Tika, the System Property of <tt> tika.config </tt> is 
checked first, and the Environment Variable of <tt> TIKA_CONFIG </tt> is tried 
next. Setting one of those will cause Tika to use your given Tika Config XML 
file.</p>
+<p>If you are calling Tika from your own code, then you can pass in the 
location of your Tika Config XML file when you construct your 
<tt>TikaConfig</tt> instance. From that, you can fetch your configured parser, 
detectors etc.</p>
+<div>
+<pre>TikaConfig config = new TikaConfig(&quot;/path/to/tika-config.xml&quot;);
+Detector detector = config.getDetector();
+Parser autoDetectParser = new AutoDetectParser(config);</pre></div>
+<p>For users of the Tika App, in addition to the system property and the 
environment variable, you can also use the <tt> --config=[tika-config.xml] 
</tt> option to select a different Tika Config XML file to use.</p>
+<p>For users of the Tika Server, in addition to the system property and the 
environment variable, you can also use <tt> -c [tika-config.xml] </tt> or <tt> 
--config [tika-config.xml] </tt> options to select a different Tika Config XML 
file to use.</p></div></div>
+      </div>
+      <div id="sidebar">
+        <div id="navigation">
+                    <h5>Apache Tika</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="../index.html">Introduction</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../download.html">Download</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../contribute.html">Contribute</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../mail-lists.html">Mailing Lists</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://wiki.apache.org/tika/" class="externalLink">Tika Wiki</a>
+          </li>
+              
+    <li class="none">
+                    <a href="https://issues.apache.org/jira/browse/TIKA" class="externalLink">Issue Tracker</a>
+          </li>
+          </ul>
+              <h5>Documentation</h5>
+            <ul>
+              
+          
+                    
+                  
+                  
+                  
+                  
+                        
+                  
+                  
+              
+            <li class="expanded">
+                    <a href="../1.9/index.html">Apache Tika 1.9</a>
+                  <ul>
+                  
+    <li class="none">
+                    <a href="../1.9/gettingstarted.html">Getting Started</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/formats.html">Supported Formats</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/parser.html">Parser API</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/parser_guide.html">Parser 5min Quick Start 
Guide</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/detection.html">Content and Language 
Detection</a>
+          </li>
+                  
+    <li class="none">
+              <strong>Configuring Tika</strong>
+        </li>
+                  
+    <li class="none">
+                    <a href="../1.9/examples.html">Usage Examples</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/api/">API Documentation</a>
+          </li>
+              </ul>
+        </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.8/index.html">Apache Tika 1.8</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.7/index.html">Apache Tika 1.7</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.6/index.html">Apache Tika 1.6</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.5/index.html">Apache Tika 1.5</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.4/index.html">Apache Tika 1.4</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.3/index.html">Apache Tika 1.3</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.2/index.html">Apache Tika 1.2</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.1/index.html">Apache Tika 1.1</a>
+                </li>
+          </ul>
+              <h5>The Apache Software Foundation</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/" class="externalLink">About</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/licenses/" class="externalLink">License</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/security/" class="externalLink">Security</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/sponsorship.html" class="externalLink">Sponsorship</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/thanks.html" class="externalLink">Thanks</a>
+          </li>
+          </ul>
+      
+          <div id="search">
+            <h5>Search with Apache Solr</h5>
+            <form action="http://search.lucidimagination.com/p:tika"
+                  method="get" id="searchform">
+              <input type="text" id="query" name="q"/>
+              <select name="searchProvider" id="searchProvider">
+                <option value="any">provider</option>
+                <option value="lucid">Lucid Find</option>
+                <option value="sl">Search-Lucene</option>
+              </select>
+              <input type="submit" id="submit" value="Search" name="Search"
+                     onclick="selectProvider(this.form)"/>
+            </form>
+          </div>
+
+          <div id="bookpromo">
+            <h5>Books about Tika</h5>
+            <p>
+              <a href="http://manning.com/mattmann/" title="Tika in Action"
+                ><img src="../mattmann_cover150.jpg"
+                      width="150" height="186"/></a>
+            </p>
+          </div>
+        </div>
+      </div>
+      <div id="footer">
+        <p>
+          Copyright &#169; 2015
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+          Site powered by <a href="http://maven.apache.org/">Apache Maven</a>. 
+          Search powered by
+          <a href="http://www.lucidimagination.com">Lucid Imagination</a>
+          and <a href="http://sematext.com">Sematext</a>.
+          <br/>
+          Apache Tika, Tika, Apache, the Apache feather logo, and the Apache
+          Tika project logo are trademarks of The Apache Software Foundation.
+        </p>
+      </div>
+    </div>
+  </body>
+</html>

Modified: tika/site/src/site/apt/1.9/configuring.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.9/configuring.apt?rev=1693715&r1=1693714&r2=1693715&view=diff
==============================================================================
--- tika/site/src/site/apt/1.9/configuring.apt (original)
+++ tika/site/src/site/apt/1.9/configuring.apt Sat Aug  1 15:24:45 2015
@@ -31,7 +31,29 @@ Configuring Tika
 
 * {Configuring Parsers}
 
-    TODO
+~~ TODO Add more on in 1.10, which has more support
+
+    In Tika 1.9, there is some support for configuring Parsers in the Tika 
Config 
+    xml. You can provide a custom list of parsers to use, in a custom order, 
and you
+    can also force certain mimetypes to be used or not-used for parsers. You 
can do
+    so with Tika Config something like:
+
+---
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+  <parsers>
+    <!-- Default Parser for most things, except for 2 mime types -->
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <mime-exclude>image/jpeg</mime-exclude>
+      <mime-exclude>application/pdf</mime-exclude>
+    </parser>
+    <!-- Use a different parser for PDF -->
+    <parser class="org.apache.tika.parser.EmptyParser">
+      <mime>application/pdf</mime>
+    </parser>
+  </parsers>
+</properties>
+---
 
     In code, the key classes to use to build up your own custom parser
     heirarchy are 
@@ -42,7 +64,22 @@ Configuring Tika
 
 * {Configuring Detectors}
 
-    TODO
+~~ TODO Add more on in 1.10, which has more support
+
+    In Tika 1.9, there is limited support for configuring Detectors in the 
Tika Config 
+    xml. You can provide a custom list of detectors to use, in a custom order, 
with
+    Tika Config something like:
+
+---
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+  <detectors>
+    <!-- Only use these two detectors, and ignore all others -->
+    <detector class="org.apache.tika.parser.pkg.ZipContainerDetector"/>
+    <detector class="org.apache.tika.mime.MimeTypes"/>
+  </detectors>
+</properties>
+---
 
     In code, the key classes to use to build up your own custom detector
     heirarchy are 
@@ -52,27 +89,32 @@ Configuring Tika
 
 * {Configuring Mime Types}
 
-    TODO
+    TODO Mention non-standard paths, and custom mime type files
 
 * {Configuring Language Identifiers}
 
-    TODO
+    At this time, there is no unified way to configure language identifiers.
+    While the work on that is ongoing, for now you will need to review the
+    {{{./api/}Tika Javadocs}} to see how individual identifiers are configured.
 
 * {Configuring Translators}
 
-    TODO
+    At this time, there is no unified way to configure Translators.
+    While the work on that is ongoing, for now you will need to review the
+    {{{./api/}Tika Javadocs}} to see how individual Translators are configured.
 
 * {Using a Tika Configuration XML file}
 
-    However you call Tika, the System Property of <pre>tika.config</pre> is
-    checked first, and the Environment Variable of <pre>TIKA_CONFIG</pre> is
+    However you call Tika, the System Property of <<< tika.config >>> is
+    checked first, and the Environment Variable of <<< TIKA_CONFIG >>> is
     tried next. Setting one of those will cause Tika to use your given
     Tika Config XML file.
 
     If you are calling Tika from your own code, then you can pass in the
     location of your Tika Config XML file when you construct your 
-    <pre>TikaConfig</pre> instance. From that, you can fetch your configured
+    <<<TikaConfig>>> instance. From that, you can fetch your configured
     parser, detectors etc.
+
 ---
 TikaConfig config = new TikaConfig("/path/to/tika-config.xml");
 Detector detector = config.getDetector();
@@ -81,10 +123,10 @@ Parser autoDetectParser = new AutoDetect
 
     For users of the Tika App, in addition to the sytem property and the
     environement variable, you can also use the 
-    <pre>--config=&gt;tika-config.xml&lt;</pre> option to select a different
+    <<< --config=[tika-config.xml] >>> option to select a different
     Tika Config XML file to use
 
     For users of the Tika Server, in addition to the sytem property and the
-    environement variable, you can also use <pre>-c 
&gt;tika-config.xml&lt;</pre> 
-    or <pre>--config &gt;tika-config.xml&lt;</pre> options to select a 
different
+    environment variable, you can also use <<< -c [tika-config.xml] >>> or
+    <<< --config [tika-config.xml] >>> options to select a different
     Tika Config XML file to use


Reply via email to