Modified: tika/site/publish/2.9.1/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.1/examples.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/2.9.1/examples.html (original) +++ tika/site/publish/2.9.1/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ <p>The <a href="./api/org/apache/tika/Tika.html">Tika facade</a>, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text</p><style type="text/css"> @import url('attached-includes/css/shCoreDefault.css'); </style> -<div id="highlighter_368634" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number54 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseToStringExample() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number55 index1 alt2"><code class="java spaces"> </code><code class="java plain">Tika tika = </code><code class="java keyword">new</code> <code class="java plain">Tika();</code></div><div class="line number56 index2 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ParsingExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><c ode class="java plain">)) {</code></div><div class="line number57 index3 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">tika.parseToString(stream);</code></div><div class="line number58 index4 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number59 index5 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> +<div id="highlighter_970194" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number54 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseToStringExample() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number55 index1 alt2"><code class="java spaces"> </code><code class="java plain">Tika tika = </code><code class="java keyword">new</code> <code class="java plain">Tika();</code></div><div class="line number56 index2 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ParsingExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><c ode class="java plain">)) {</code></div><div class="line number57 index3 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">tika.parseToString(stream);</code></div><div class="line number58 index4 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number59 index5 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> <div class="section"> <h4><a name="Parsing_using_the_Auto-Detect_Parser">Parsing using the Auto-Detect Parser</a></h4> -<p>For more control, you can call the <a href="./api/org/apache/tika/parser/Parser.html">Tika Parsers</a> directly. Most likely, you'll want to start out using the <a href="./api/org/apache/tika/parser/AutoDetectParser.html">Auto-Detect Parser</a>, which automatically figures out what kind of content you have, then calls the appropriate parser for you.</p><div id="highlighter_247972" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number85 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseExample() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number86 index1 alt1"><code class="java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java pla in">AutoDetectParser();</code></div><div class="line number87 index2 alt2"><code class="java spaces"> </code><code class="java plain">BodyContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler();</code></div><div class="line number88 index3 alt1"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number89 index4 alt2"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ParsingExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number90 index5 alt1"><code class="java spaces"> &nb sp; </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number91 index6 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number92 index7 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number93 index8 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> +<p>For more control, you can call the <a href="./api/org/apache/tika/parser/Parser.html">Tika Parsers</a> directly. Most likely, you'll want to start out using the <a href="./api/org/apache/tika/parser/AutoDetectParser.html">Auto-Detect Parser</a>, which automatically figures out what kind of content you have, then calls the appropriate parser for you.</p><div id="highlighter_105078" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number85 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseExample() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number86 index1 alt1"><code class="java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java pla in">AutoDetectParser();</code></div><div class="line number87 index2 alt2"><code class="java spaces"> </code><code class="java plain">BodyContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler();</code></div><div class="line number88 index3 alt1"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number89 index4 alt2"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ParsingExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number90 index5 alt1"><code class="java spaces"> &nb sp; </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number91 index6 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number92 index7 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number93 index8 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> <div class="section"> <h3><a name="Picking_different_output_formats">Picking different output formats</a></h3> <p>With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the <a class="externalLink" href="http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html">ContentHandler</a> you supply to the Parser.</p> <div class="section"> <h4><a name="Parsing_to_Plain_Text">Parsing to Plain Text</a></h4> -<p>By using the <a href="./api/org/apache/tika/sax/BodyContentHandler.html">BodyContentHandler</a>, you can request that Tika return only the content of the document's body as a plain-text string.</p><div id="highlighter_253813" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number47 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseToPlainText() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number48 index1 alt1"><code class="java spaces"> </code><code class="java plain">BodyContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler();</code></div><div class="line number49 index2 alt2"> </div><div class="line number50 index3 alt1"><code class="java space s"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number51 index4 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number52 index5 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number53 index6 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</c ode></div><div class="line number54 index7 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number55 index8 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number56 index9 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> +<p>By using the <a href="./api/org/apache/tika/sax/BodyContentHandler.html">BodyContentHandler</a>, you can request that Tika return only the content of the document's body as a plain-text string.</p><div id="highlighter_688646" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number47 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseToPlainText() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number48 index1 alt1"><code class="java spaces"> </code><code class="java plain">BodyContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler();</code></div><div class="line number49 index2 alt2"> </div><div class="line number50 index3 alt1"><code class="java space s"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number51 index4 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number52 index5 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number53 index6 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</c ode></div><div class="line number54 index7 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number55 index8 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number56 index9 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> <div class="section"> <h4><a name="Parsing_to_XHTML">Parsing to XHTML</a></h4> -<p>By using the <a href="./api/org/apache/tika/sax/ToXMLContentHandler.html">ToXMLContentHandler</a>, you can get the XHTML content of the whole document as a string.</p><div id="highlighter_883684" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number61 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number62 index1 alt1"><code class="java spaces"> </code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler();</code></div><div class="line number63 index2 alt2"> </div><div class="line number64 index3 alt1"><code class="java spaces"> </code><cod e class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number65 index4 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number66 index5 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number67 index6 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number68 in dex7 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number69 index8 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number70 index9 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div> -<p>If you just want the body of the xhtml document, without the header, you can chain together a <a href="./api/org/apache/tika/sax/BodyContentHandler.html">BodyContentHandler</a> and a <a href="./api/org/apache/tika/sax/ToXMLContentHandler.html">ToXMLContentHandler</a> as shown:</p><div id="highlighter_789789" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number76 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseBodyToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number77 index1 alt2"><code class="java spaces"> </code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler(</code></div><div class="line number78 index2 alt 1"><code class="java spaces"> </code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler());</code></div><div class="line number79 index3 alt2"> </div><div class="line number80 index4 alt1"><code class="java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number81 index5 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number82 index6 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number83 index7 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number84 index8 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number85 index9 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number86 index10 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> +<p>By using the <a href="./api/org/apache/tika/sax/ToXMLContentHandler.html">ToXMLContentHandler</a>, you can get the XHTML content of the whole document as a string.</p><div id="highlighter_979403" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number61 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number62 index1 alt1"><code class="java spaces"> </code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler();</code></div><div class="line number63 index2 alt2"> </div><div class="line number64 index3 alt1"><code class="java spaces"> </code><cod e class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number65 index4 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number66 index5 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number67 index6 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number68 in dex7 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number69 index8 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number70 index9 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div> +<p>If you just want the body of the xhtml document, without the header, you can chain together a <a href="./api/org/apache/tika/sax/BodyContentHandler.html">BodyContentHandler</a> and a <a href="./api/org/apache/tika/sax/ToXMLContentHandler.html">ToXMLContentHandler</a> as shown:</p><div id="highlighter_840171" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number76 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseBodyToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number77 index1 alt2"><code class="java spaces"> </code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler(</code></div><div class="line number78 index2 alt 1"><code class="java spaces"> </code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler());</code></div><div class="line number79 index3 alt2"> </div><div class="line number80 index4 alt1"><code class="java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number81 index5 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number82 index6 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">)) {</code></div><div class="line number83 index7 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number84 index8 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number85 index9 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number86 index10 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> <div class="section"> <h4><a name="Fetching_just_certain_bits_of_the_XHTML">Fetching just certain bits of the XHTML</a></h4> -<p>It possible to execute XPath queries on the parse results, to fetch only certain bits of the XHTML. </p><div id="highlighter_695609" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number92 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseOnePartToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number93 index1 alt2"><code class="java spaces"> </code><code class="java comments">// Only get things under html -> body -> div (class=header)</code></div><div class="line number94 index2 alt1"><code class="java spaces"> </code><code class="java plain">XPathParser xhtmlParser = </code><code class="java keyword">new</code> <code class="java plain">XPathParser(</code><code class="java string"> "xhtml"</code><code class="java plain">, XHTMLContentHandler.XHTML);</code></div><div class="line number95 index3 alt2"><code class="java spaces"> </code><code class="java plain">Matcher divContentMatcher = xhtmlParser.parse(</code><code class="java string">"/xhtml:html/xhtml:body/xhtml:div/descendant::node()"</code><code class="java plain">);</code></div><div class="line number96 index4 alt1"><code class="java spaces"> </code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">MatchingContentHandler(</code></div><div class="line number97 index5 alt2"><code class="java spaces"> </code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler(), divContentMatcher);</code></div><div class="line number98 index6 alt1"> </div><div class="line number99 index7 alt2"><code class= "java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number100 index8 alt1"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number101 index9 alt2"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test2.doc"</code><code class="java plain">)) {</code></div><div class="line number102 index10 alt1"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handle r, metadata);</code></div><div class="line number103 index11 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number104 index12 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number105 index13 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> +<p>It possible to execute XPath queries on the parse results, to fetch only certain bits of the XHTML. </p><div id="highlighter_473850" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number92 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseOnePartToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number93 index1 alt2"><code class="java spaces"> </code><code class="java comments">// Only get things under html -> body -> div (class=header)</code></div><div class="line number94 index2 alt1"><code class="java spaces"> </code><code class="java plain">XPathParser xhtmlParser = </code><code class="java keyword">new</code> <code class="java plain">XPathParser(</code><code class="java string"> "xhtml"</code><code class="java plain">, XHTMLContentHandler.XHTML);</code></div><div class="line number95 index3 alt2"><code class="java spaces"> </code><code class="java plain">Matcher divContentMatcher = xhtmlParser.parse(</code><code class="java string">"/xhtml:html/xhtml:body/xhtml:div/descendant::node()"</code><code class="java plain">);</code></div><div class="line number96 index4 alt1"><code class="java spaces"> </code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">MatchingContentHandler(</code></div><div class="line number97 index5 alt2"><code class="java spaces"> </code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler(), divContentMatcher);</code></div><div class="line number98 index6 alt1"> </div><div class="line number99 index7 alt2"><code class= "java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number100 index8 alt1"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number101 index9 alt2"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test2.doc"</code><code class="java plain">)) {</code></div><div class="line number102 index10 alt1"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handle r, metadata);</code></div><div class="line number103 index11 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number104 index12 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number105 index13 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> <div class="section"> <h3><a name="Custom_Content_Handlers">Custom Content Handlers</a></h3> <p>The textual output of parsing a file with Tika is returned via the SAX <a class="externalLink" href="http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html">ContentHandler</a> you pass to the parse method. It is possible to customise your parsing by supplying your own ContentHandler which does special things.</p> @@ -141,16 +104,16 @@ <p>By using the <a href="./api/org/apache/tika/sax/PhoneExtractingContentHandler.html">PhoneExtractingContentHandler</a>, you can have any phone numbers found in the textual content of the document extracted and placed into the Metadata object for you.</p></div> <div class="section"> <h4><a name="Streaming_the_plain_text_in_chunks">Streaming the plain text in chunks</a></h4> -<p>Sometimes, you want to chunk the resulting text up, perhaps to output as you go minimising memory use, perhaps to output to HDFS files, or any other reason! With a small custom content handler, you can do that.</p><div id="highlighter_167512" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number113 index0 alt2"><code class="java keyword">public</code> <code class="java plain">List<String> parseToPlainTextChunks() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number114 index1 alt1"><code class="java spaces"> </code><code class="java keyword">final</code> <code class="java plain">List<String> chunks = </code><code class="java keyword">new</code> <code class="java plain">ArrayList<>();</code></div><div class="line number115 index2 alt2"><code c lass="java spaces"> </code><code class="java plain">chunks.add(</code><code class="java string">""</code><code class="java plain">);</code></div><div class="line number116 index3 alt1"><code class="java spaces"> </code><code class="java plain">ContentHandlerDecorator handler = </code><code class="java keyword">new</code> <code class="java plain">ContentHandlerDecorator() {</code></div><div class="line number117 index4 alt2"><code class="java spaces"> </code><code class="java color1">@Override</code></div><div class="line number118 index5 alt1"><code class="java spaces"> </code><code class="java keyword">public</code> <code class="java keyword">void</code> <code class="java plain">characters(</code><code class="java keyword">char</code><code class="java plain">[] ch, </code><code class="java keyword">int</code> <code class="java plain">start, </c ode><code class="java keyword">int</code> <code class="java plain">length) {</code></div><div class="line number119 index6 alt2"><code class="java spaces"> </code><code class="java plain">String lastChunk = chunks.get(chunks.size() - </code><code class="java value">1</code><code class="java plain">);</code></div><div class="line number120 index7 alt1"><code class="java spaces"> </code><code class="java plain">String thisStr = </code><code class="java keyword">new</code> <code class="java plain">String(ch, start, length);</code></div><div class="line number121 index8 alt2"> </div><div class="line number122 index9 alt1"><code class="java spaces"> </code><code class="java keyword">if</code> <code class="java plain">(lastChunk.length() + length > MAXIMUM_TEXT_CHUNK_SIZE) { </code></div><div class="line number123 index10 alt2"><code class="java spaces"> </code><code class="java plain">chunks.add(thisStr);</code></div><div class="line number124 index11 alt1"><code class="java spaces"> </code><code class="java plain">} </code><code class="java keyword">else</code> <code class="java plain">{</code></div><div class="line number125 index12 alt2"><code class="java spaces"> </code><code class="java plain">chunks.set(chunks.size() - </code><code class="java value">1</code><code class="java plain">, lastChunk + thisStr);</code></div><div class="line number126 index13 alt1"><code class="java spaces"> </code><code class="java plain">}</c ode></div><div class="line number127 index14 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number128 index15 alt1"><code class="java spaces"> </code><code class="java plain">};</code></div><div class="line number129 index16 alt2"> </div><div class="line number130 index17 alt1"><code class="java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number131 index18 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number132 index19 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class ="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test2.doc"</code><code class="java plain">)) {</code></div><div class="line number133 index20 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number134 index21 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">chunks;</code></div><div class="line number135 index22 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number136 index23 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> +<p>Sometimes, you want to chunk the resulting text up, perhaps to output as you go minimising memory use, perhaps to output to HDFS files, or any other reason! With a small custom content handler, you can do that.</p><div id="highlighter_220265" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number113 index0 alt2"><code class="java keyword">public</code> <code class="java plain">List<String> parseToPlainTextChunks() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number114 index1 alt1"><code class="java spaces"> </code><code class="java keyword">final</code> <code class="java plain">List<String> chunks = </code><code class="java keyword">new</code> <code class="java plain">ArrayList<>();</code></div><div class="line number115 index2 alt2"><code c lass="java spaces"> </code><code class="java plain">chunks.add(</code><code class="java string">""</code><code class="java plain">);</code></div><div class="line number116 index3 alt1"><code class="java spaces"> </code><code class="java plain">ContentHandlerDecorator handler = </code><code class="java keyword">new</code> <code class="java plain">ContentHandlerDecorator() {</code></div><div class="line number117 index4 alt2"><code class="java spaces"> </code><code class="java color1">@Override</code></div><div class="line number118 index5 alt1"><code class="java spaces"> </code><code class="java keyword">public</code> <code class="java keyword">void</code> <code class="java plain">characters(</code><code class="java keyword">char</code><code class="java plain">[] ch, </code><code class="java keyword">int</code> <code class="java plain">start, </c ode><code class="java keyword">int</code> <code class="java plain">length) {</code></div><div class="line number119 index6 alt2"><code class="java spaces"> </code><code class="java plain">String lastChunk = chunks.get(chunks.size() - </code><code class="java value">1</code><code class="java plain">);</code></div><div class="line number120 index7 alt1"><code class="java spaces"> </code><code class="java plain">String thisStr = </code><code class="java keyword">new</code> <code class="java plain">String(ch, start, length);</code></div><div class="line number121 index8 alt2"> </div><div class="line number122 index9 alt1"><code class="java spaces"> </code><code class="java keyword">if</code> <code class="java plain">(lastChunk.length() + length > MAXIMUM_TEXT_CHUNK_SIZE) { </code></div><div class="line number123 index10 alt2"><code class="java spaces"> </code><code class="java plain">chunks.add(thisStr);</code></div><div class="line number124 index11 alt1"><code class="java spaces"> </code><code class="java plain">} </code><code class="java keyword">else</code> <code class="java plain">{</code></div><div class="line number125 index12 alt2"><code class="java spaces"> </code><code class="java plain">chunks.set(chunks.size() - </code><code class="java value">1</code><code class="java plain">, lastChunk + thisStr);</code></div><div class="line number126 index13 alt1"><code class="java spaces"> </code><code class="java plain">}</c ode></div><div class="line number127 index14 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number128 index15 alt1"><code class="java spaces"> </code><code class="java plain">};</code></div><div class="line number129 index16 alt2"> </div><div class="line number130 index17 alt1"><code class="java spaces"> </code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number131 index18 alt2"><code class="java spaces"> </code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number132 index19 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class ="java plain">(InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test2.doc"</code><code class="java plain">)) {</code></div><div class="line number133 index20 alt2"><code class="java spaces"> </code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number134 index21 alt1"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">chunks;</code></div><div class="line number135 index22 alt2"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number136 index23 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> <div class="section"> <h3><a name="Translation">Translation</a></h3> <p>Tika provides a pluggable Translation system, which allow you to send the results of parsing off to an external system or program to have the text translated into another language.</p> <div class="section"> <h4><a name="Translation_using_the_Microsoft_Translation_API">Translation using the Microsoft Translation API</a></h4> -<p>In order to use the Microsoft Translation API, you need to sign up for a Microsoft account, get an API key, then pass the key to Tika before translating.</p><div id="highlighter_383916" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number23 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String microsoftTranslateToFrench(String text) {</code></div><div class="line number24 index1 alt1"><code class="java spaces"> </code><code class="java plain">MicrosoftTranslator translator = </code><code class="java keyword">new</code> <code class="java plain">MicrosoftTranslator();</code></div><div class="line number25 index2 alt2"><code class="java spaces"> </code><code class="java comments">// Change the id and secret! See <a href="http://msdn.microsoft.com/en-us/library/hh454950.aspx.">http://msdn.microso ft.com/en-us/library/hh454950.aspx.</a></code></div><div class="line number26 index3 alt1"><code class="java spaces"> </code><code class="java plain">translator.setId(</code><code class="java string">"dummy-id"</code><code class="java plain">);</code></div><div class="line number27 index4 alt2"><code class="java spaces"> </code><code class="java plain">translator.setSecret(</code><code class="java string">"dummy-secret"</code><code class="java plain">);</code></div><div class="line number28 index5 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number29 index6 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">translator.translate(text, </code><code class="java string">"fr"</code><code class="java plain">);</code></div><div class= "line number30 index7 alt1"><code class="java spaces"> </code><code class="java plain">} </code><code class="java keyword">catch</code> <code class="java plain">(Exception e) {</code></div><div class="line number31 index8 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java string">"Error while translating."</code><code class="java plain">;</code></div><div class="line number32 index9 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number33 index10 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> +<p>In order to use the Microsoft Translation API, you need to sign up for a Microsoft account, get an API key, then pass the key to Tika before translating.</p><div id="highlighter_733998" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number23 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String microsoftTranslateToFrench(String text) {</code></div><div class="line number24 index1 alt1"><code class="java spaces"> </code><code class="java plain">MicrosoftTranslator translator = </code><code class="java keyword">new</code> <code class="java plain">MicrosoftTranslator();</code></div><div class="line number25 index2 alt2"><code class="java spaces"> </code><code class="java comments">// Change the id and secret! See <a href="http://msdn.microsoft.com/en-us/library/hh454950.aspx.">http://msdn.microso ft.com/en-us/library/hh454950.aspx.</a></code></div><div class="line number26 index3 alt1"><code class="java spaces"> </code><code class="java plain">translator.setId(</code><code class="java string">"dummy-id"</code><code class="java plain">);</code></div><div class="line number27 index4 alt2"><code class="java spaces"> </code><code class="java plain">translator.setSecret(</code><code class="java string">"dummy-secret"</code><code class="java plain">);</code></div><div class="line number28 index5 alt1"><code class="java spaces"> </code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number29 index6 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">translator.translate(text, </code><code class="java string">"fr"</code><code class="java plain">);</code></div><div class= "line number30 index7 alt1"><code class="java spaces"> </code><code class="java plain">} </code><code class="java keyword">catch</code> <code class="java plain">(Exception e) {</code></div><div class="line number31 index8 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java string">"Error while translating."</code><code class="java plain">;</code></div><div class="line number32 index9 alt1"><code class="java spaces"> </code><code class="java plain">}</code></div><div class="line number33 index10 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div> <div class="section"> <h3><a name="Language_Identification">Language Identification</a></h3> -<p>Tika provides support for identifying the language of text, through the <a href="./api/org/apache/tika/language/LanguageIdentifier.html">LanguageIdentifier</a> class.</p><div id="highlighter_404136" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number23 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String identifyLanguage(String text) {</code></div><div class="line number24 index1 alt1"><code class="java spaces"> </code><code class="java plain">LanguageIdentifier identifier = </code><code class="java keyword">new</code> <code class="java plain">LanguageIdentifier(text);</code></div><div class="line number25 index2 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">identifier.getLanguage();</code></div><div class="line number26 index3 alt 1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> +<p>Tika provides support for identifying the language of text, through the <a href="./api/org/apache/tika/language/LanguageIdentifier.html">LanguageIdentifier</a> class.</p><div id="highlighter_722428" class="syntaxhighlighter nogutter java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number23 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String identifyLanguage(String text) {</code></div><div class="line number24 index1 alt1"><code class="java spaces"> </code><code class="java plain">LanguageIdentifier identifier = </code><code class="java keyword">new</code> <code class="java plain">LanguageIdentifier(text);</code></div><div class="line number25 index2 alt2"><code class="java spaces"> </code><code class="java keyword">return</code> <code class="java plain">identifier.getLanguage();</code></div><div class="line number26 index3 alt 1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div> <div class="section"> <h3><a name="Additional_Examples">Additional Examples</a></h3> <p>A number of other examples are also available, including all of the examples from the <a class="externalLink" href="http://manning.com/mattmann/">Tika In Action book</a>. These can all be found in the <a class="externalLink" href="https://svn.apache.org/repos/asf/tika/trunk/tika-example">Tika Example module</a> in SVN.</p></div></div> @@ -873,20 +836,7 @@ </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -900,13 +850,10 @@ </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p>
Modified: tika/site/publish/2.9.1/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.1/formats.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/2.9.1/formats.html (original) +++ tika/site/publish/2.9.1/formats.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -1382,20 +1345,7 @@ </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -1409,13 +1359,10 @@ </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p> Modified: tika/site/publish/2.9.1/gettingstarted.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.1/gettingstarted.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/2.9.1/gettingstarted.html (original) +++ tika/site/publish/2.9.1/gettingstarted.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -1014,20 +977,7 @@ curl http://.../document.doc \ </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -1041,13 +991,10 @@ curl http://.../document.doc \ </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p> Modified: tika/site/publish/2.9.1/index.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.1/index.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/2.9.1/index.html (original) +++ tika/site/publish/2.9.1/index.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -818,20 +781,7 @@ </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -845,13 +795,10 @@ </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p> Modified: tika/site/publish/2.9.1/parser.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.1/parser.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/2.9.1/parser.html (original) +++ tika/site/publish/2.9.1/parser.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -896,20 +859,7 @@ try { </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -923,13 +873,10 @@ try { </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p> Modified: tika/site/publish/2.9.1/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.1/parser_guide.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/2.9.1/parser_guide.html (original) +++ tika/site/publish/2.9.1/parser_guide.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -894,20 +857,7 @@ public class HelloParser extends Abstrac </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -921,13 +871,10 @@ public class HelloParser extends Abstrac </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p> Modified: tika/site/publish/3.0.0-BETA/configuring.html URL: http://svn.apache.org/viewvc/tika/site/publish/3.0.0-BETA/configuring.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/3.0.0-BETA/configuring.html (original) +++ tika/site/publish/3.0.0-BETA/configuring.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -885,20 +848,7 @@ Parser autoDetectParser = new AutoDetect </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -912,13 +862,10 @@ Parser autoDetectParser = new AutoDetect </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p> Modified: tika/site/publish/3.0.0-BETA/detection.html URL: http://svn.apache.org/viewvc/tika/site/publish/3.0.0-BETA/detection.html?rev=1916752&r1=1916751&r2=1916752&view=diff ============================================================================== --- tika/site/publish/3.0.0-BETA/detection.html (original) +++ tika/site/publish/3.0.0-BETA/detection.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); </style> <link rel="icon" type="image/png" href="../tikaNoText16.png" /> - <script type="text/javascript"> - function selectProvider(form) { - provider = form.elements['searchProvider'].value; - if (provider == "any") { - if (Math.random() > 0.5) { - provider = "lucid"; - } else { - provider = "sl"; - } - } - if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; - } else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; - } - days = 90; - date = new Date(); - date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); - expires = "; expires=" + date.toGMTString(); - document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { - if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { - cStart=cStart + "searchProvider=".length; - cEnd=document.cookie.indexOf(";", cStart); - if (cEnd==-1) { - cEnd=document.cookie.length; - } - provider = unescape(document.cookie.substring(cStart,cEnd)); - document.forms['searchform'].elements['searchProvider'].value = provider; - } - } - document.forms['searchform'].elements['q'].focus(); - } - </script> </head> - <body onLoad="initProvider();"> + <body> <div id="body"> <div id="banner"> <a href="https://tika.apache.org" id="bannerLeft" title="Apache Tika" @@ -854,20 +817,7 @@ for (InputStream is : myListOfStreams) { </li> </ul> - <div id="search"> - <h5>Search with Apache Solr</h5> - <form action="http://search.lucidimagination.com/p:tika" - method="get" id="searchform"> - <input type="text" id="query" name="q"/> - <select name="searchProvider" id="searchProvider"> - <option value="any">provider</option> - <option value="lucid">Lucid Find</option> - <option value="sl">Search-Lucene</option> - </select> - <input type="submit" id="submit" value="Search" name="Search" - onclick="selectProvider(this.form)"/> - </form> - </div> + <div id="bookpromo"> <h5>Books about Tika</h5> @@ -881,13 +831,10 @@ for (InputStream is : myListOfStreams) { </div> <div id="footer"> <p> - Copyright © 2023 + Copyright © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Site powered by <a href="https://maven.apache.org/">Apache Maven</a>. - Search powered by - <a href="http://www.lucidimagination.com">Lucid Imagination</a> - and <a href="http://sematext.com">Sematext</a>. - <br/> + <br/> Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. </p>