Author: nick
Date: Sat Aug  1 19:49:31 2015
New Revision: 1693752

URL: http://svn.apache.org/r1693752
Log:
Update the configuration docs for 1.10

Added:
    tika/site/publish/1.10/configuring.html
    tika/site/src/site/apt/1.10/configuring.apt
      - copied, changed from r1693715, 
tika/site/src/site/apt/1.9/configuring.apt

Added: tika/site/publish/1.10/configuring.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.10/configuring.html?rev=1693752&view=auto
==============================================================================
--- tika/site/publish/1.10/configuring.html (added)
+++ tika/site/publish/1.10/configuring.html Sat Aug  1 19:49:31 2015
@@ -0,0 +1,413 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+
+
+
+
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <title>Apache Tika - Configuring Tika</title>
+    <style type="text/css" media="all">
+      @import url("../css/site.css");
+    </style>
+    <link rel="icon" type="image/png" href="../tikaNoText16.png" />
+    <script type="text/javascript">
+      function selectProvider(form) {
+        provider = form.elements['searchProvider'].value;
+        if (provider == "any") {
+          if (Math.random() > 0.5) {
+            provider = "lucid";
+          } else {
+            provider = "sl";
+          }
+        }
+        if (provider == "lucid") {
+          form.action = "http://find.searchhub.org/p:tika";;
+        } else if (provider == "sl") {
+          form.action = "http://search-lucene.com/tika";;
+        }
+        days = 90;
+        date = new Date();
+        date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+        expires = "; expires=" + date.toGMTString();
+        document.cookie = "searchProvider=" + provider + expires + "; path=/";
+      }
+      function initProvider() {
+        if (document.cookie.length>0) {
+          cStart=document.cookie.indexOf("searchProvider=");
+          if (cStart!=-1) {
+            cStart=cStart + "searchProvider=".length;
+            cEnd=document.cookie.indexOf(";", cStart);
+            if (cEnd==-1) {
+              cEnd=document.cookie.length;
+            }
+            provider = unescape(document.cookie.substring(cStart,cEnd));
+            document.forms['searchform'].elements['searchProvider'].value = 
provider;
+          }
+        }
+        document.forms['searchform'].elements['q'].focus();
+      }
+    </script>
+  </head>
+  <body onLoad="initProvider();">
+    <div id="body">
+      <div id="banner">
+        <a href="http://tika.apache.org" id="bannerLeft" title="Apache Tika"
+          ><img src="http://tika.apache.org/tika.png" alt="Apache Tika"
+                width="292" height="100"/></a>
+        <a href="http://www.apache.org/" id="bannerRight"
+           title="The Apache Software Foundation"
+          ><img src="http://tika.apache.org/asf-logo.gif" alt="The Apache Software Foundation"
+                width="387" height="100"/></a>
+      </div>
+      <div id="content">
+        <!-- Licensed to the Apache Software Foundation (ASF) under one or 
more --><!-- contributor license agreements.  See the NOTICE file distributed 
with --><!-- this work for additional information regarding copyright 
ownership. --><!-- The ASF licenses this file to You under the Apache License, 
Version 2.0 --><!-- (the "License"); you may not use this file except in 
compliance with --><!-- the License.  You may obtain a copy of the License at 
--><!--  --><!-- http://www.apache.org/licenses/LICENSE-2.0 --><!--  --><!-- 
Unless required by applicable law or agreed to in writing, software --><!-- 
distributed under the License is distributed on an "AS IS" BASIS, --><!-- 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
--><!-- See the License for the specific language governing permissions and 
--><!-- limitations under the License. --><div class="section">
+<h2>Configuring Tika<a name="Configuring_Tika"></a></h2>
+<p>Out of the box, Apache Tika will attempt to start with all available 
Detectors and Parsers, running with sensible defaults. For most users, this 
default configuration will work well.</p>
+<p>This page gives you information on how to configure the various components 
of Apache Tika, such as Parsers and Detectors, if you need fine-grained control 
over ordering, exclusions and the like.</p>
+<ul>
+<li><a href="#Configuring_Tika">Configuring Tika</a>
+<ul>
+<li><a href="#Configuring_Parsers">Configuring Parsers</a></li>
+<li><a href="#Configuring_Detectors">Configuring Detectors</a></li>
+<li><a href="#Configuring_Mime_Types">Configuring Mime Types</a></li>
+<li><a href="#Configuring_Language_Identifiers">Configuring Language 
Identifiers</a></li>
+<li><a href="#Configuring_Translators">Configuring Translators</a></li>
+<li><a href="#Using_a_Tika_Configuration_XML_file">Using a Tika Configuration 
XML file</a></li></ul></li></ul>
+<div class="section">
+<h3><a name="Configuring_Parsers">Configuring Parsers</a></h3>
+<p>Through the Tika Config xml, it is possible to have a high degree of 
control over which parsers are or aren't used, in what order of preferences 
etc. It is also possible to override just certain parts, to (for example) have 
&quot;default except for PDF&quot;.</p>
+<p>Currently, it is only possible to have a single parser run against a 
document. There is on-going discussion around fallback parsers and combining 
the output of multiple parsers running on a document, but none of these are 
available yet.</p>
+<p>To override certain default parser behaviours, include the 
<tt>DefaultParser</tt> in your configuration, with excludes, then add other 
parser definitions in. To prevent the <tt>DefaultParser</tt> (with its 
auto-discovery) from being used, simply omit it from your config, and list all 
the other parsers you want instead.</p>
+<p>To override just some default behaviour, you can use a Tika Config 
something like this:</p>
+<div>
+<pre>&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
+&lt;properties&gt;
+  &lt;parsers&gt;
+    &lt;!-- Default Parser for most things, except for 2 mime types, and never
+         use the Executable Parser --&gt;
+    &lt;parser class=&quot;org.apache.tika.parser.DefaultParser&quot;&gt;
+      &lt;mime-exclude&gt;image/jpeg&lt;/mime-exclude&gt;
+      &lt;mime-exclude&gt;application/pdf&lt;/mime-exclude&gt;
+      &lt;parser-exclude 
class=&quot;org.apache.tika.parser.executable.ExecutableParser&quot;/&gt;
+    &lt;/parser&gt;
+    &lt;!-- Use a different parser for PDF --&gt;
+    &lt;parser class=&quot;org.apache.tika.parser.EmptyParser&quot;&gt;
+      &lt;mime&gt;application/pdf&lt;/mime&gt;
+    &lt;/parser&gt;
+  &lt;/parsers&gt;
+&lt;/properties&gt;</pre></div>
+<p>To configure things in code, the key classes to use to build up your own 
custom parser hierarchy are <a 
href="./api/org/apache/tika/parser/DefaultParser.html">org.apache.tika.parser.DefaultParser</a>,
 <a 
href="./api/org/apache/tika/parser/CompositeParser.html">org.apache.tika.parser.CompositeParser</a>
 and <a 
href="./api/org/apache/tika/parser/ParserDecorator.html">org.apache.tika.parser.ParserDecorator</a>.</p></div>
+<div class="section">
+<h3><a name="Configuring_Detectors">Configuring Detectors</a></h3>
+<p>Through the Tika Config xml, it is possible to have a high degree of 
control over which detectors are or aren't used, in what order of preferences 
etc. It is also possible to override just certain parts, to (for example) have 
&quot;default except for no POIFS Container Detection&quot;.</p>
+<p>To override certain default detector behaviours, include the 
<tt>DefaultDetector</tt>, with any <tt>detector-exclude</tt> entries you need, 
in your configuration, then add other detector definitions in. To prevent the 
<tt>DefaultDetector</tt> (with its auto-discovery) from being used, simply omit 
it from your config, and list all the other detectors you want instead.</p>
+<p>To override just some default behaviour, you can use a Tika Config 
something like this:</p>
+<div>
+<pre>&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
+&lt;properties&gt;
+  &lt;detectors&gt;
+    &lt;!-- All detectors except built-in container ones --&gt;
+    &lt;detector class=&quot;org.apache.tika.detect.DefaultDetector&quot;&gt;
+      &lt;detector-exclude 
class=&quot;org.apache.tika.parser.pkg.ZipContainerDetector&quot;/&gt;
+      &lt;detector-exclude 
class=&quot;org.apache.tika.parser.microsoft.POIFSContainerDetector&quot;/&gt;
+    &lt;/detector&gt;
+  &lt;/detectors&gt;
+&lt;/properties&gt;</pre></div>
+<p>Or, to use only certain detectors, you can use a Tika Config something 
like this:</p>
+<div>
+<pre>&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
+&lt;properties&gt;
+  &lt;detectors&gt;
+    &lt;!-- Only use these two detectors, and ignore all others --&gt;
+    &lt;detector 
class=&quot;org.apache.tika.parser.pkg.ZipContainerDetector&quot;/&gt;
+    &lt;detector class=&quot;org.apache.tika.mime.MimeTypes&quot;/&gt;
+  &lt;/detectors&gt;
+&lt;/properties&gt;</pre></div>
+<p>In code, the key classes to use to build up your own custom detector 
hierarchy are <a 
href="./api/org/apache/tika/detect/DefaultDetector.html">org.apache.tika.detect.DefaultDetector</a>
 and <a 
href="./api/org/apache/tika/detect/CompositeDetector.html">org.apache.tika.detect.CompositeDetector</a>.</p></div>
+<div class="section">
+<h3><a name="Configuring_Mime_Types">Configuring Mime Types</a></h3>
+<p>TODO Mention non-standard paths, and custom mime type files</p></div>
+<div class="section">
+<h3><a name="Configuring_Language_Identifiers">Configuring Language 
Identifiers</a></h3>
+<p>At this time, there is no unified way to configure language identifiers. 
While the work on that is ongoing, for now you will need to review the <a 
href="./api/">Tika Javadocs</a> to see how individual identifiers are 
configured.</p></div>
+<div class="section">
+<h3><a name="Configuring_Translators">Configuring Translators</a></h3>
+<p>At this time, there is no unified way to configure Translators. While the 
work on that is ongoing, for now you will need to review the <a 
href="./api/">Tika Javadocs</a> to see how individual Translators are 
configured.</p><!-- When Translators can have their parameters configured, 
mention here about --><!-- specifying which single one to use in the Tika 
Config XML --></div>
+<div class="section">
+<h3><a name="Using_a_Tika_Configuration_XML_file">Using a Tika Configuration 
XML file</a></h3>
+<p>However you call Tika, the System Property of <tt> tika.config </tt> is 
checked first, and the Environment Variable of <tt> TIKA_CONFIG </tt> is tried 
next. Setting one of those will cause Tika to use your given Tika Config XML 
file.</p>
+<p>If you are calling Tika from your own code, then you can pass in the 
location of your Tika Config XML file when you construct your 
<tt>TikaConfig</tt> instance. From that, you can fetch your configured parser, 
detectors etc.</p>
+<div>
+<pre>TikaConfig config = new TikaConfig(&quot;/path/to/tika-config.xml&quot;);
+Detector detector = config.getDetector();
+Parser autoDetectParser = new AutoDetectParser(config);</pre></div>
+<p>For users of the Tika App, in addition to the system property and the 
environment variable, you can also use the <tt> --config=[tika-config.xml] 
</tt> option to select a different Tika Config XML file to use.</p>
+<p>For users of the Tika Server, in addition to the system property and the 
environment variable, you can also use the <tt> -c [tika-config.xml] </tt> or <tt> 
--config [tika-config.xml] </tt> options to select a different Tika Config XML 
file to use.</p></div></div>
+      </div>
+      <div id="sidebar">
+        <div id="navigation">
+                    <h5>Apache Tika</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="../index.html">Introduction</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../download.html">Download</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../contribute.html">Contribute</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../mail-lists.html">Mailing Lists</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://wiki.apache.org/tika/" 
class="externalLink">Tika Wiki</a>
+          </li>
+              
+    <li class="none">
+                    <a href="https://issues.apache.org/jira/browse/TIKA" 
class="externalLink">Issue Tracker</a>
+          </li>
+          </ul>
+              <h5>Documentation</h5>
+            <ul>
+              
+          
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="expanded">
+                    <a href="../1.9/index.html">Apache Tika 1.9</a>
+                  <ul>
+                  
+    <li class="none">
+                    <a href="../1.9/gettingstarted.html">Getting Started</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/formats.html">Supported Formats</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/parser.html">Parser API</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/parser_guide.html">Parser 5min Quick Start 
Guide</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/detection.html">Content and Language 
Detection</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/configuring.html">Configuring Tika</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/examples.html">Usage Examples</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.9/api/">API Documentation</a>
+          </li>
+              </ul>
+        </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.8/index.html">Apache Tika 1.8</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.7/index.html">Apache Tika 1.7</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.6/index.html">Apache Tika 1.6</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.5/index.html">Apache Tika 1.5</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.4/index.html">Apache Tika 1.4</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.3/index.html">Apache Tika 1.3</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.2/index.html">Apache Tika 1.2</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.1/index.html">Apache Tika 1.1</a>
+                </li>
+          </ul>
+              <h5>The Apache Software Foundation</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/" 
class="externalLink">About</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/licenses/" 
class="externalLink">License</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/security/" 
class="externalLink">Security</a>
+          </li>
+              
+    <li class="none">
+                    <a 
href="http://www.apache.org/foundation/sponsorship.html" 
class="externalLink">Sponsorship</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/thanks.html" 
class="externalLink">Thanks</a>
+          </li>
+          </ul>
+      
+          <div id="search">
+            <h5>Search with Apache Solr</h5>
+            <form action="http://search.lucidimagination.com/p:tika"
+                  method="get" id="searchform">
+              <input type="text" id="query" name="q"/>
+              <select name="searchProvider" id="searchProvider">
+                <option value="any">provider</option>
+                <option value="lucid">Lucid Find</option>
+                <option value="sl">Search-Lucene</option>
+              </select>
+              <input type="submit" id="submit" value="Search" name="Search"
+                     onclick="selectProvider(this.form)"/>
+            </form>
+          </div>
+
+          <div id="bookpromo">
+            <h5>Books about Tika</h5>
+            <p>
+              <a href="http://manning.com/mattmann/" title="Tika in Action"
+                ><img src="../mattmann_cover150.jpg" alt="Tika in Action"
+                      width="150" height="186"/></a>
+            </p>
+          </div>
+        </div>
+      </div>
+      <div id="footer">
+        <p>
+          Copyright &#169; 2015
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+          Site powered by <a href="http://maven.apache.org/">Apache Maven</a>. 
+          Search powered by
+          <a href="http://www.lucidimagination.com">Lucid Imagination</a>
+          and <a href="http://sematext.com">Sematext</a>.
+          <br/>
+          Apache Tika, Tika, Apache, the Apache feather logo, and the Apache
+          Tika project logo are trademarks of The Apache Software Foundation.
+        </p>
+      </div>
+    </div>
+  </body>
+</html>

Copied: tika/site/src/site/apt/1.10/configuring.apt (from r1693715, 
tika/site/src/site/apt/1.9/configuring.apt)
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.10/configuring.apt?p2=tika/site/src/site/apt/1.10/configuring.apt&p1=tika/site/src/site/apt/1.9/configuring.apt&r1=1693715&r2=1693752&rev=1693752&view=diff
==============================================================================
--- tika/site/src/site/apt/1.9/configuring.apt (original)
+++ tika/site/src/site/apt/1.10/configuring.apt Sat Aug  1 19:49:31 2015
@@ -31,21 +31,33 @@ Configuring Tika
 
 * {Configuring Parsers}
 
-~~ TODO Add more on in 1.10, which has more support
+    Through the Tika Config xml, it is possible to have a high degree of 
control
+    over which parsers are or aren't used, in what order of preferences etc. 
It 
+    is also possible to override just certain parts, to (for example) have 
"default
+    except for PDF".
+
+    Currently, it is only possible to have a single parser run against a 
document.
+    There is on-going discussion around fallback parsers and combining the 
output
+    of multiple parsers running on a document, but none of these are available 
yet.
+
+    To override certain default parser behaviours, include the <<<DefaultParser>>>
+    in your configuration, with excludes, then add other parser definitions in.
+    To prevent the <<<DefaultParser>>> (with its auto-discovery) from being used,
+    simply omit it from your config, and list all the other parsers you want instead.
 
-    In Tika 1.9, there is some support for configuring Parsers in the Tika 
Config 
-    xml. You can provide a custom list of parser to use, in a custom order, 
and you
-    can also force certain mimetypes to be used or not-used for parsers. You 
can do
-    so with Tika Config something like:
+    To override just some default behaviour, you can use a Tika Config 
something
+    like this:
 
 ---
 <?xml version="1.0" encoding="UTF-8"?>
 <properties>
   <parsers>
-    <!-- Default Parser for most things, except for 2 mime types -->
+    <!-- Default Parser for most things, except for 2 mime types, and never
+         use the Executable Parser -->
     <parser class="org.apache.tika.parser.DefaultParser">
       <mime-exclude>image/jpeg</mime-exclude>
       <mime-exclude>application/pdf</mime-exclude>
+      <parser-exclude 
class="org.apache.tika.parser.executable.ExecutableParser"/>
     </parser>
     <!-- Use a different parser for PDF -->
     <parser class="org.apache.tika.parser.EmptyParser">
@@ -55,8 +67,8 @@ Configuring Tika
 </properties>
 ---
 
-    In code, the key classes to use to build up your own custom parser
-    heirarchy are 
+    To configure things in code, the key classes to use to build up your own 
custom 
+    parser hierarchy are 
     
{{{./api/org/apache/tika/parser/DefaultParser.html}org.apache.tika.parser.DefaultParser}},
     
{{{./api/org/apache/tika/parser/CompositeParser.html}org.apache.tika.parser.CompositeParser}}
     and
@@ -64,11 +76,35 @@ Configuring Tika
 
 * {Configuring Detectors}
 
-~~ TODO Add more on in 1.10, which has more support
+    Through the Tika Config xml, it is possible to have a high degree of 
control
+    over which detectors are or aren't used, in what order of preferences etc. 
It 
+    is also possible to override just certain parts, to (for example) have 
"default
+    except for no POIFS Container Detection".
+
+    To override certain default detector behaviours, include the
+    <<<DefaultDetector>>>, with any <<<detector-exclude>>> entries you need,
+    in your configuration, then add other detector definitions in. To prevent
+    the <<<DefaultDetector>>> (with its auto-discovery) from being used, simply omit it
+    from your config, and list all the other detectors you want instead.
 
-    In Tika 1.9, there is limited support for configuring Detectors in the 
Tika Config 
-    xml. You can provide a custom list of detectors to use, in a custom order, 
with
-    Tika Config something like:
+    To override just some default behaviour, you can use a Tika Config 
something
+    like this:
+
+---
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+  <detectors>
+    <!-- All detectors except built-in container ones -->
+    <detector class="org.apache.tika.detect.DefaultDetector">
+      <detector-exclude 
class="org.apache.tika.parser.pkg.ZipContainerDetector"/>
+      <detector-exclude 
class="org.apache.tika.parser.microsoft.POIFSContainerDetector"/>
+    </detector>
+  </detectors>
+</properties>
+---
+
+    Or, to use only certain detectors, you can use a Tika Config something
+    like this:
 
 ---
 <?xml version="1.0" encoding="UTF-8"?>
@@ -103,6 +139,9 @@ Configuring Tika
     While the work on that is ongoing, for now you will need to review the
     {{{./api/}Tika Javadocs}} to see how individual Translators are configured.
 
+~~ When Translators can have their parameters configured, mention here about
+~~ specifying which single one to use in the Tika Config XML
+
 * {Using a Tika Configuration XML file}
 
     However you call Tika, the System Property of <<< tika.config >>> is


Reply via email to