Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java Wed May 13 13:49:36 2015 @@ -1,38 +1,38 @@ -/** - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.example; - -import java.io.ByteArrayInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.WriteOutContentHandler; -import org.springframework.context.ApplicationContext; -import org.springframework.context.support.ClassPathXmlApplicationContext; - -import com.google.common.base.Charsets; - -public class SpringExample { - - public static void main(String[] args) throws Exception { - ApplicationContext context = new ClassPathXmlApplicationContext( - new String[] { "org/apache/tika/example/spring.xml" }); - Parser parser = context.getBean("tika", Parser.class); - parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(Charsets.UTF_8)), - new WriteOutContentHandler(System.out), new Metadata(), - new ParseContext()); - } - -} +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.example; + +import java.io.ByteArrayInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.WriteOutContentHandler; +import org.springframework.context.ApplicationContext; +import org.springframework.context.support.ClassPathXmlApplicationContext; + +import com.google.common.base.Charsets; + +public class SpringExample { + + public static void main(String[] args) throws Exception { + ApplicationContext context = new ClassPathXmlApplicationContext( + new String[] { "org/apache/tika/example/spring.xml" }); + Parser parser = context.getBean("tika", Parser.class); + parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(Charsets.UTF_8)), + new WriteOutContentHandler(System.out), new Metadata(), + new ParseContext()); + } + +}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java Wed May 13 13:49:36 2015 @@ -1,218 +1,218 @@ -/** - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.example; - -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.Reader; -import java.net.URL; -import java.nio.CharBuffer; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; -import java.util.zip.GZIPInputStream; - -import org.apache.tika.Tika; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.CompositeParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.ParserDecorator; -import org.apache.tika.parser.html.HtmlMapper; -import org.apache.tika.parser.html.HtmlParser; -import org.apache.tika.parser.html.IdentityHtmlMapper; -import org.apache.tika.parser.txt.TXTParser; -import org.apache.tika.parser.xml.XMLParser; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.LinkContentHandler; -import org.apache.tika.sax.TeeContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -public class TIAParsingExample { - - public static String parseToStringExample() throws Exception { - File document = new File("example.doc"); - String content = new Tika().parseToString(document); - System.out.print(content); - return content; - } - - public static void parseToReaderExample() throws Exception { - File document = new File("example.doc"); - Reader reader = new Tika().parse(document); - try { - char[] buffer = new char[1000]; - int n = reader.read(buffer); - while (n != -1) { - System.out.append(CharBuffer.wrap(buffer, 0, n)); - n = reader.read(buffer); - } - } finally { - reader.close(); - } - } - - public static void parseFileInputStream(String filename) throws Exception { - Parser parser = new AutoDetectParser(); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - InputStream stream = new FileInputStream(new File(filename)); - try { - parser.parse(stream, handler, metadata, context); - } finally { - stream.close(); - } - } - - public static void parseURLStream(String address) throws Exception { - Parser parser = new AutoDetectParser(); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - InputStream stream = new GZIPInputStream(new URL(address).openStream()); - try { - parser.parse(stream, handler, metadata, context); - } finally { - stream.close(); - } - } - - public static void parseTikaInputStream(String filename) throws Exception { - Parser parser = new AutoDetectParser(); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - InputStream stream = TikaInputStream.get(new File(filename)); - try { - parser.parse(stream, handler, metadata, context); - } finally { - stream.close(); - } - } - - public static File tikaInputStreamGetFile(String filename) throws Exception { - InputStream stream = TikaInputStream.get(new File(filename)); - try { - TikaInputStream tikaInputStream = TikaInputStream.get(stream); - File file = tikaInputStream.getFile(); - return file; - } finally { - stream.close(); - } - } - - public static void useHtmlParser() throws Exception { - InputStream stream = new ByteArrayInputStream(new byte[0]); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - Parser parser = new HtmlParser(); - parser.parse(stream, handler, metadata, context); - } - - public static void useCompositeParser() throws Exception { - InputStream stream = new ByteArrayInputStream(new byte[0]); - ContentHandler handler = new DefaultHandler(); - ParseContext context = new ParseContext(); - Map<MediaType, Parser> parsersByType = new HashMap<MediaType, Parser>(); - parsersByType.put(MediaType.parse("text/html"), new HtmlParser()); - parsersByType.put(MediaType.parse("application/xml"), new XMLParser()); - - CompositeParser parser = new CompositeParser(); - parser.setParsers(parsersByType); - parser.setFallback(new TXTParser()); - - Metadata metadata = new Metadata(); - metadata.set(Metadata.CONTENT_TYPE, "text/html"); - parser.parse(stream, handler, metadata, context); - } - - public static void useAutoDetectParser() throws Exception { - InputStream stream = new ByteArrayInputStream(new byte[0]); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - Parser parser = new AutoDetectParser(); - parser.parse(stream, handler, metadata, context); - } - - public static void testTeeContentHandler(String filename) throws Exception { - InputStream stream = new ByteArrayInputStream(new byte[0]); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - Parser parser = new AutoDetectParser(); - LinkContentHandler linkCollector = new LinkContentHandler(); - OutputStream output = new FileOutputStream(new File(filename)); - try { - ContentHandler handler = new TeeContentHandler( - new BodyContentHandler(output), linkCollector); - parser.parse(stream, handler, metadata, context); - } finally { - output.close(); - } - } - - public static void testLocale() throws Exception { - InputStream stream = new ByteArrayInputStream(new byte[0]); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - Parser parser = new AutoDetectParser(); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.ENGLISH); - parser.parse(stream, handler, metadata, context); - } - - public static void testHtmlMapper() throws Exception { - InputStream stream = new ByteArrayInputStream(new byte[0]); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - Parser parser = new AutoDetectParser(); - ParseContext context = new ParseContext(); - context.set(HtmlMapper.class, new IdentityHtmlMapper()); - parser.parse(stream, handler, metadata, context); - } - - public static void testCompositeDocument() throws Exception { - InputStream stream = new ByteArrayInputStream(new byte[0]); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - Parser parser = new AutoDetectParser(); - ParseContext context = new ParseContext(); - context.set(Parser.class, new ParserDecorator(parser) { - private static final long serialVersionUID = 4424210691523343833L; - - @Override - public void parse(InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { - // custom processing of the component document - } - }); - parser.parse(stream, handler, metadata, context); - } - -} +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.example; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.Reader; +import java.net.URL; +import java.nio.CharBuffer; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; +import org.apache.tika.parser.html.HtmlMapper; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.parser.html.IdentityHtmlMapper; +import org.apache.tika.parser.txt.TXTParser; +import org.apache.tika.parser.xml.XMLParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.LinkContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class TIAParsingExample { + + public static String parseToStringExample() throws Exception { + File document = new File("example.doc"); + String content = new Tika().parseToString(document); + System.out.print(content); + return content; + } + + public static void parseToReaderExample() throws Exception { + File document = new File("example.doc"); + Reader reader = new Tika().parse(document); + try { + char[] buffer = new char[1000]; + int n = reader.read(buffer); + while (n != -1) { + System.out.append(CharBuffer.wrap(buffer, 0, n)); + n = reader.read(buffer); + } + } finally { + reader.close(); + } + } + + public static void parseFileInputStream(String filename) throws Exception { + Parser parser = new AutoDetectParser(); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + InputStream stream = new FileInputStream(new File(filename)); + try { + parser.parse(stream, handler, metadata, context); + } finally { + stream.close(); + } + } + + public static void parseURLStream(String address) throws Exception { + Parser parser = new AutoDetectParser(); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + InputStream stream = new GZIPInputStream(new URL(address).openStream()); + try { + parser.parse(stream, handler, metadata, context); + } finally { + stream.close(); + } + } + + public static void parseTikaInputStream(String filename) throws Exception { + Parser parser = new AutoDetectParser(); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + InputStream stream = TikaInputStream.get(new File(filename)); + try { + parser.parse(stream, handler, metadata, context); + } finally { + stream.close(); + } + } + + public static File tikaInputStreamGetFile(String filename) throws Exception { + InputStream stream = TikaInputStream.get(new File(filename)); + try { + TikaInputStream tikaInputStream = TikaInputStream.get(stream); + File file = tikaInputStream.getFile(); + return file; + } finally { + stream.close(); + } + } + + public static void useHtmlParser() throws Exception { + InputStream stream = new ByteArrayInputStream(new byte[0]); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + Parser parser = new HtmlParser(); + parser.parse(stream, handler, metadata, context); + } + + public static void useCompositeParser() throws Exception { + InputStream stream = new ByteArrayInputStream(new byte[0]); + ContentHandler handler = new DefaultHandler(); + ParseContext context = new ParseContext(); + Map<MediaType, Parser> parsersByType = new HashMap<MediaType, Parser>(); + parsersByType.put(MediaType.parse("text/html"), new HtmlParser()); + parsersByType.put(MediaType.parse("application/xml"), new XMLParser()); + + CompositeParser parser = new CompositeParser(); + parser.setParsers(parsersByType); + parser.setFallback(new TXTParser()); + + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html"); + parser.parse(stream, handler, metadata, context); + } + + public static void useAutoDetectParser() throws Exception { + InputStream stream = new ByteArrayInputStream(new byte[0]); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + Parser parser = new AutoDetectParser(); + parser.parse(stream, handler, metadata, context); + } + + public static void testTeeContentHandler(String filename) throws Exception { + InputStream stream = new ByteArrayInputStream(new byte[0]); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + Parser parser = new AutoDetectParser(); + LinkContentHandler linkCollector = new LinkContentHandler(); + OutputStream output = new FileOutputStream(new File(filename)); + try { + ContentHandler handler = new TeeContentHandler( + new BodyContentHandler(output), linkCollector); + parser.parse(stream, handler, metadata, context); + } finally { + output.close(); + } + } + + public static void testLocale() throws Exception { + InputStream stream = new ByteArrayInputStream(new byte[0]); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.ENGLISH); + parser.parse(stream, handler, metadata, context); + } + + public static void testHtmlMapper() throws Exception { + InputStream stream = new ByteArrayInputStream(new byte[0]); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(HtmlMapper.class, new IdentityHtmlMapper()); + parser.parse(stream, handler, metadata, context); + } + + public static void testCompositeDocument() throws Exception { + InputStream stream = new ByteArrayInputStream(new byte[0]); + ContentHandler handler = new DefaultHandler(); + Metadata metadata = new Metadata(); + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(Parser.class, new ParserDecorator(parser) { + private static final long serialVersionUID = 4424210691523343833L; + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + // custom processing of the component document + } + }); + parser.parse(stream, handler, metadata, context); + } + +} Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java Wed May 13 13:49:36 2015 @@ -1,47 +1,47 @@ -/** - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.example; - -//JDK imports -import java.io.IOException; -import java.util.Collections; -import java.util.zip.ZipEntry; -import java.util.zip.ZipFile; - -/** - * - * - * Example code listing from Chapter 1. Lists a zip file's entries using JDK's - * standard APIs. - * - */ -public class ZipListFiles { - public static void main(String[] args) throws Exception { - if (args.length > 0) { - for (String file : args) { - System.out.println("Files in " + file + " file:"); - listZipEntries(file); - } - } - } - - public static void listZipEntries(String path) throws IOException { - ZipFile zip = new ZipFile(path); - for (ZipEntry entry : Collections.list(zip.entries())) { - System.out.println(entry.getName()); - } - } - +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.example; + +//JDK imports +import java.io.IOException; +import java.util.Collections; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +/** + * + * + * Example code listing from Chapter 1. Lists a zip file's entries using JDK's + * standard APIs. + * + */ +public class ZipListFiles { + public static void main(String[] args) throws Exception { + if (args.length > 0) { + for (String file : args) { + System.out.println("Files in " + file + " file:"); + listZipEntries(file); + } + } + } + + public static void listZipEntries(String path) throws IOException { + ZipFile zip = new ZipFile(path); + for (ZipEntry entry : Collections.list(zip.entries())) { + System.out.println(entry.getName()); + } + } + } \ No newline at end of file Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java (original) +++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java Wed May 13 13:49:36 2015 @@ -1,52 +1,52 @@ -/** - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.example; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.PrintStream; - -import junit.framework.Assert; - -import org.apache.commons.io.FileUtils; -import org.junit.Test; - -import com.google.common.base.Charsets; - -@SuppressWarnings("deprecation") -public class SimpleTextExtractorTest { - - @Test - public void testSimpleTextExtractor() throws Exception { - String message = - "Hello, World! This is simple UTF-8 text content written" - + " in English to test autodetection of the character" - + " encoding of the input stream."; - ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - - PrintStream out = System.out; - System.setOut(new PrintStream(buffer, true, Charsets.UTF_8.name())); - - File file = new File("target", "test.txt"); - FileUtils.writeStringToFile(file, message); - SimpleTextExtractor.main(new String[] { file.getPath() }); - file.delete(); - - System.setOut(out); - - Assert.assertEquals(message, buffer.toString(Charsets.UTF_8.name()).trim()); - } - -} +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.example; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; + +import junit.framework.Assert; + +import org.apache.commons.io.FileUtils; +import org.junit.Test; + +import com.google.common.base.Charsets; + +@SuppressWarnings("deprecation") +public class SimpleTextExtractorTest { + + @Test + public void testSimpleTextExtractor() throws Exception { + String message = + "Hello, World! This is simple UTF-8 text content written" + + " in English to test autodetection of the character" + + " encoding of the input stream."; + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + + PrintStream out = System.out; + System.setOut(new PrintStream(buffer, true, Charsets.UTF_8.name())); + + File file = new File("target", "test.txt"); + FileUtils.writeStringToFile(file, message); + SimpleTextExtractor.main(new String[] { file.getPath() }); + file.delete(); + + System.setOut(out); + + Assert.assertEquals(message, buffer.toString(Charsets.UTF_8.name()).trim()); + } + +} Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java Wed May 13 13:49:36 2015 @@ -1,80 +1,80 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.parser.pdf; - -import java.io.Serializable; - -import org.apache.tika.exception.AccessPermissionException; -import org.apache.tika.metadata.AccessPermissions; -import org.apache.tika.metadata.Metadata; - -/** - * Checks whether or not a document allows extraction generally - * or extraction for accessibility only. - */ -public class AccessChecker implements Serializable { - - private static final long serialVersionUID = 6492570218190936986L; - - private final boolean needToCheck; - private final boolean allowAccessibility; - - /** - * This constructs an {@link AccessChecker} that - * will not perform any checking and will always return without - * throwing an exception. - * <p> - * This constructor is available to allow for Tika's legacy ( <= v1.7) behavior. - */ - public AccessChecker() { - needToCheck = false; - allowAccessibility = true; - } - /** - * This constructs an {@link AccessChecker} that will check - * for whether or not content should be extracted from a document. - * - * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed - */ - public AccessChecker(boolean allowExtractionForAccessibility) { - needToCheck = true; - this.allowAccessibility = allowExtractionForAccessibility; - } - - /** - * Checks to see if a document's content should be extracted based - * on metadata values and the value of {@link #allowAccessibility} in the constructor. - * - * @param metadata - * @throws AccessPermissionException if access is not permitted - */ - public void check(Metadata metadata) throws AccessPermissionException { - if (!needToCheck) { - return; - } - if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { - if (allowAccessibility) { - if("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { - return; - } - throw new AccessPermissionException("Content extraction for accessibility is not allowed."); - } - throw new AccessPermissionException("Content extraction is not allowed."); - } - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.pdf; + +import java.io.Serializable; + +import org.apache.tika.exception.AccessPermissionException; +import org.apache.tika.metadata.AccessPermissions; +import org.apache.tika.metadata.Metadata; + +/** + * Checks whether or not a document allows extraction generally + * or extraction for accessibility only. + */ +public class AccessChecker implements Serializable { + + private static final long serialVersionUID = 6492570218190936986L; + + private final boolean needToCheck; + private final boolean allowAccessibility; + + /** + * This constructs an {@link AccessChecker} that + * will not perform any checking and will always return without + * throwing an exception. + * <p> + * This constructor is available to allow for Tika's legacy ( <= v1.7) behavior. + */ + public AccessChecker() { + needToCheck = false; + allowAccessibility = true; + } + /** + * This constructs an {@link AccessChecker} that will check + * for whether or not content should be extracted from a document. + * + * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed + */ + public AccessChecker(boolean allowExtractionForAccessibility) { + needToCheck = true; + this.allowAccessibility = allowExtractionForAccessibility; + } + + /** + * Checks to see if a document's content should be extracted based + * on metadata values and the value of {@link #allowAccessibility} in the constructor. + * + * @param metadata + * @throws AccessPermissionException if access is not permitted + */ + public void check(Metadata metadata) throws AccessPermissionException { + if (!needToCheck) { + return; + } + if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { + if (allowAccessibility) { + if("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { + return; + } + throw new AccessPermissionException("Content extraction for accessibility is not allowed."); + } + throw new AccessPermissionException("Content extraction is not allowed."); + } + } +} Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Wed May 13 13:49:36 2015 @@ -14,20 +14,20 @@ package org.apache.tika.parser.pdf; * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.pdfbox.util.PDFTextStripper; - -import java.io.IOException; -import java.io.InputStream; -import java.io.Serializable; -import java.util.Locale; -import java.util.Properties; - -/** - * Config for PDFParser. - * + * limitations under the License. + */ + +import org.apache.pdfbox.util.PDFTextStripper; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Locale; +import java.util.Properties; + +/** + * Config for PDFParser. + * * This allows parameters to be set programmatically: * <ol> * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li> @@ -77,14 +77,14 @@ public class PDFParserConfig implements //The character width-based tolerance value used to estimate where spaces in text should be added private Float averageCharTolerance; - //The space width-based tolerance value used to estimate where spaces in text should be added - private Float spacingTolerance; - - private AccessChecker accessChecker; - - public PDFParserConfig() { - init(this.getClass().getResourceAsStream("PDFParser.properties")); - } + //The space width-based tolerance value used to estimate where spaces in text should be added + private Float spacingTolerance; + + private AccessChecker accessChecker; + + public PDFParserConfig() { + init(this.getClass().getResourceAsStream("PDFParser.properties")); + } /** * Loads properties from InputStream and then tries to close InputStream. @@ -136,24 +136,24 @@ public class PDFParserConfig implements setExtractInlineImages( getProp(props.getProperty("extractInlineImages"), getExtractInlineImages())); - setExtractUniqueInlineImagesOnly( - getProp(props.getProperty("extractUniqueInlineImagesOnly"), - getExtractUniqueInlineImagesOnly())); - - boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); - boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); - - if (checkExtractAccessPermission == false) { - //silently ignore the crazy configuration of checkExtractAccessPermission = false, - //but allowExtractionForAccessibility=false - accessChecker = new AccessChecker(); - } else { - accessChecker = new AccessChecker(allowExtractionForAccessibility); - } - } - - /** - * Configures the given pdf2XHTML. + setExtractUniqueInlineImagesOnly( + getProp(props.getProperty("extractUniqueInlineImagesOnly"), + getExtractUniqueInlineImagesOnly())); + + boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); + boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); + + if (checkExtractAccessPermission == false) { + //silently ignore the crazy configuration of checkExtractAccessPermission = false, + //but allowExtractionForAccessibility=false + accessChecker = new AccessChecker(); + } else { + accessChecker = new AccessChecker(allowExtractionForAccessibility); + } + } + + /** + * Configures the given pdf2XHTML. * * @param pdf2XHTML */ @@ -342,20 +342,20 @@ public class PDFParserConfig implements /** * See {@link PDFTextStripper#setSpacingTolerance(float)} - */ - public void setSpacingTolerance(Float spacingTolerance) { - this.spacingTolerance = spacingTolerance; - } - - public void setAccessChecker(AccessChecker accessChecker) { - this.accessChecker = accessChecker; - } - - public AccessChecker getAccessChecker() { - return accessChecker; - } - - private boolean getProp(String p, boolean defaultMissing){ + */ + public void setSpacingTolerance(Float spacingTolerance) { + this.spacingTolerance = spacingTolerance; + } + + public void setAccessChecker(AccessChecker accessChecker) { + this.accessChecker = accessChecker; + } + + public AccessChecker getAccessChecker() { + return accessChecker; + } + + private boolean getProp(String p, boolean defaultMissing){ if (p == null){ return defaultMissing; } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java Wed May 13 13:49:36 2015 @@ -1,117 +1,117 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.pkg; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Collections; -import java.util.Set; - -import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.XHTMLContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -import com.github.junrar.Archive; -import com.github.junrar.exception.RarException; -import com.github.junrar.rarfile.FileHeader; - -/** - * Parser for Rar files. - */ -public class RarParser extends AbstractParser { - private static final long serialVersionUID = 6157727985054451501L; - - private static final Set<MediaType> SUPPORTED_TYPES = Collections - .singleton(MediaType.application("x-rar-compressed")); - - @Override - public Set<MediaType> getSupportedTypes(ParseContext arg0) { - return SUPPORTED_TYPES; - } - - @Override - public void parse(InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) throws IOException, - SAXException, TikaException { - - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - - EmbeddedDocumentExtractor extractor = context.get( - EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); - - TemporaryResources tmp = new TemporaryResources(); - Archive rar = null; - try { - TikaInputStream tis = TikaInputStream.get(stream, tmp); - rar = new Archive(tis.getFile()); - - if (rar.isEncrypted()) { - throw new EncryptedDocumentException(); - } - - //Without this BodyContentHandler does not work - xhtml.element("div", " "); - - FileHeader header = rar.nextFileHeader(); - while (header != null && !Thread.currentThread().isInterrupted()) { - if (!header.isDirectory()) { - InputStream subFile = null; - try { - subFile = rar.getInputStream(header); - - Metadata entrydata = PackageParser.handleEntryMetadata( - "".equals(header.getFileNameW())?header.getFileNameString():header.getFileNameW(), - header.getCTime(), header.getMTime(), - header.getFullUnpackSize(), - xhtml - ); - - if (extractor.shouldParseEmbedded(entrydata)) { - extractor.parseEmbedded(subFile, handler, entrydata, true); - } - } finally { - if (subFile != null) - subFile.close(); - } - } - - header = rar.nextFileHeader(); - } - - } catch (RarException e) { - throw new TikaException("RarParser Exception", e); - } finally { - if (rar != null) - rar.close(); - tmp.close(); - } - - xhtml.endDocument(); - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import com.github.junrar.Archive; +import com.github.junrar.exception.RarException; +import com.github.junrar.rarfile.FileHeader; + +/** + * Parser for Rar files. + */ +public class RarParser extends AbstractParser { + private static final long serialVersionUID = 6157727985054451501L; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections + .singleton(MediaType.application("x-rar-compressed")); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext arg0) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + EmbeddedDocumentExtractor extractor = context.get( + EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + TemporaryResources tmp = new TemporaryResources(); + Archive rar = null; + try { + TikaInputStream tis = TikaInputStream.get(stream, tmp); + rar = new Archive(tis.getFile()); + + if (rar.isEncrypted()) { + throw new EncryptedDocumentException(); + } + + //Without this BodyContentHandler does not work + xhtml.element("div", " "); + + FileHeader header = rar.nextFileHeader(); + while (header != null && !Thread.currentThread().isInterrupted()) { + if (!header.isDirectory()) { + InputStream subFile = null; + try { + subFile = rar.getInputStream(header); + + Metadata entrydata = PackageParser.handleEntryMetadata( + "".equals(header.getFileNameW())?header.getFileNameString():header.getFileNameW(), + header.getCTime(), header.getMTime(), + header.getFullUnpackSize(), + xhtml + ); + + if (extractor.shouldParseEmbedded(entrydata)) { + extractor.parseEmbedded(subFile, handler, entrydata, true); + } + } finally { + if (subFile != null) + subFile.close(); + } + } + + header = rar.nextFileHeader(); + } + + } catch (RarException e) { + throw new TikaException("RarParser Exception", e); + } finally { + if (rar != null) + rar.close(); + tmp.close(); + } + + xhtml.endDocument(); + } +} Modified: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties (original) +++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties Wed May 13 13:49:36 2015 @@ -18,8 +18,8 @@ extractAnnotationText true sortByPosition false suppressDuplicateOverlappingText false useNonSequentialParser false -extractAcroFormContent true -extractInlineImages false -extractUniqueInlineImagesOnly true -checkExtractAccessPermission false -allowExtractionForAccessibility true +extractAcroFormContent true +extractInlineImages false +extractUniqueInlineImagesOnly true +checkExtractAccessPermission false +allowExtractionForAccessibility true Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java Wed May 13 13:49:36 2015 @@ -1,246 +1,246 @@ -package org.apache.tika.parser.mock; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import static junit.framework.TestCase.assertEquals; -import static junit.framework.TestCase.assertTrue; -import static org.junit.Assert.fail; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.PrintStream; -import java.util.Date; - -import org.apache.tika.TikaTest; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.IOUtils; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.Parser; -import org.junit.Test; - -public class MockParserTest extends TikaTest { - private final static String M = "/test-documents/mock/"; - private final static Parser PARSER = new AutoDetectParser(); - - @Override - public XMLResult getXML(String path, Metadata m) throws Exception { - //note that this is specific to MockParserTest with addition of M to the path! - InputStream is = getResourceAsStream(M+path); - try { - return super.getXML(is, PARSER, m); - } finally { - IOUtils.closeQuietly(is); - } - } - - @Test - public void testExample() throws Exception { - Metadata m = new Metadata(); - PrintStream out = System.out; - PrintStream err = System.err; - ByteArrayOutputStream outBos = new ByteArrayOutputStream(); - ByteArrayOutputStream errBos = new ByteArrayOutputStream(); - PrintStream tmpOut = new PrintStream(outBos, true, IOUtils.UTF_8.toString()); - PrintStream tmpErr = new PrintStream(errBos, true, IOUtils.UTF_8.toString()); - System.setOut(tmpOut); - System.setErr(tmpErr); - try { - assertThrowable("example.xml", m, IOException.class, "not another IOException"); - assertMockParser(m); - } finally { - System.setOut(out); - System.setErr(err); - } - String outString = new String(outBos.toByteArray(), IOUtils.UTF_8); - assertContains("writing to System.out", outString); - - String errString = new String(errBos.toByteArray(), IOUtils.UTF_8); - assertContains("writing to System.err", errString); - - } - - @Test - public void testNothingBad() throws Exception { - Metadata m = new Metadata(); - String content = getXML("nothing_bad.xml", m).xml; - assertEquals("Geoffrey Chaucer", m.get("author")); - assertContains("<p>And bathed every veyne in swich licour,</p>", content); - assertMockParser(m); - } - - @Test - public void testNullPointer() throws Exception { - Metadata m = new Metadata(); - assertThrowable("null_pointer.xml", m, NullPointerException.class, "another null pointer exception"); - assertMockParser(m); - } - - @Test - public void testNullPointerNoMsg() throws Exception { - Metadata m = new Metadata(); - assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null); - assertMockParser(m); - } - - - @Test - public void testSleep() throws Exception { - long start = new Date().getTime(); - Metadata m = new Metadata(); - String content = getXML("sleep.xml", m).xml; - assertMockParser(m); - long elapsed = new Date().getTime()-start; - //should sleep for at least 3000 - boolean enoughTimeHasElapsed = elapsed > 2000; - assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed); - assertMockParser(m); - } - - @Test - public void testHeavyHang() throws Exception { - long start = new Date().getTime(); - Metadata m = new Metadata(); - - String content = getXML("heavy_hang.xml", m).xml; - assertMockParser(m); - long elapsed = new Date().getTime()-start; - //should sleep for at least 3000 - boolean enoughTimeHasElapsed = elapsed > 2000; - assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed); - assertMockParser(m); - } - - @Test - public void testFakeOOM() throws Exception { - Metadata m = new Metadata(); - assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom"); - assertMockParser(m); - } - - @Test - public void testRealOOM() throws Exception { - //Note: we're not actually testing the diff between fake and real oom - //i.e. by creating child process and setting different -Xmx or - //memory profiling. - Metadata m = new Metadata(); - assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space"); - assertMockParser(m); - } - - @Test - public void testInterruptibleSleep() { - //Without static initialization of the parser, it can take ~1 second after t.start() - //before the parser actually calls parse. This is - //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc. - //This is not thread creation overhead. - ParserRunnable r = new ParserRunnable("sleep_interruptible.xml"); - Thread t = new Thread(r); - t.start(); - long start = new Date().getTime(); - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - //swallow - } - - t.interrupt(); - - try { - t.join(10000); - } catch (InterruptedException e) { - //swallow - } - long elapsed = new Date().getTime()-start; - boolean shortEnough = elapsed < 2000;//the xml file specifies 3000 - assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough); - } - - @Test - public void testNonInterruptibleSleep() { - ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml"); - Thread t = new Thread(r); - t.start(); - long start = new Date().getTime(); - try { - //make sure that the thread has actually started - Thread.sleep(1000); - } catch (InterruptedException e) { - //swallow - } - t.interrupt(); - try { - t.join(20000); - } catch (InterruptedException e) { - //swallow - } - long elapsed = new Date().getTime()-start; - boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000 - assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough); - } - - private class ParserRunnable implements Runnable { - private final String path; - ParserRunnable(String path) { - this.path = path; - } - @Override - public void run() { - Metadata m = new Metadata(); - try { - getXML(path, m); - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - assertMockParser(m); - } - } - } - - private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) { - - try { - getXML(path, m); - } catch (Throwable t) { - //if this is a throwable wrapped in a TikaException, use the cause - if (t instanceof TikaException && t.getCause() != null) { - t = t.getCause(); - } - if (! (t.getClass().isAssignableFrom(expected))){ - fail(t.getClass() +" is not assignable from "+expected); - } - if (message != null) { - assertEquals(message, t.getMessage()); - } - } - } - - private void assertMockParser(Metadata m) { - String[] parsers = m.getValues("X-Parsed-By"); - //make sure that it was actually parsed by mock. - boolean parsedByMock = false; - for (String parser : parsers) { - if (parser.equals("org.apache.tika.parser.mock.MockParser")) { - parsedByMock = true; - break; - } - } - assertTrue("mock parser should have been called", parsedByMock); - } -} +package org.apache.tika.parser.mock; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; +import static org.junit.Assert.fail; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Date; + +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +public class MockParserTest extends TikaTest { + private final static String M = "/test-documents/mock/"; + private final static Parser PARSER = new AutoDetectParser(); + + @Override + public XMLResult getXML(String path, Metadata m) throws Exception { + //note that this is specific to MockParserTest with addition of M to the path! + InputStream is = getResourceAsStream(M+path); + try { + return super.getXML(is, PARSER, m); + } finally { + IOUtils.closeQuietly(is); + } + } + + @Test + public void testExample() throws Exception { + Metadata m = new Metadata(); + PrintStream out = System.out; + PrintStream err = System.err; + ByteArrayOutputStream outBos = new ByteArrayOutputStream(); + ByteArrayOutputStream errBos = new ByteArrayOutputStream(); + PrintStream tmpOut = new PrintStream(outBos, true, IOUtils.UTF_8.toString()); + PrintStream tmpErr = new PrintStream(errBos, true, IOUtils.UTF_8.toString()); + System.setOut(tmpOut); + System.setErr(tmpErr); + try { + assertThrowable("example.xml", m, IOException.class, "not another IOException"); + assertMockParser(m); + } finally { + System.setOut(out); + System.setErr(err); + } + String outString = new String(outBos.toByteArray(), IOUtils.UTF_8); + assertContains("writing to System.out", outString); + + String errString = new String(errBos.toByteArray(), IOUtils.UTF_8); + assertContains("writing to System.err", errString); + + } + + @Test + public void testNothingBad() throws Exception { + Metadata m = new Metadata(); + String content = getXML("nothing_bad.xml", m).xml; + assertEquals("Geoffrey Chaucer", m.get("author")); + assertContains("<p>And bathed every veyne in swich licour,</p>", content); + assertMockParser(m); + } + + @Test + public void testNullPointer() throws Exception { + Metadata m = new Metadata(); + assertThrowable("null_pointer.xml", m, NullPointerException.class, "another null pointer exception"); + assertMockParser(m); + } + + @Test + public void testNullPointerNoMsg() throws Exception { + Metadata m = new Metadata(); + assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null); + assertMockParser(m); + } + + + @Test + public void testSleep() throws Exception { + long start = new Date().getTime(); + Metadata m = new Metadata(); + String content = getXML("sleep.xml", m).xml; + assertMockParser(m); + long elapsed = new Date().getTime()-start; + //should sleep for at least 3000 + boolean enoughTimeHasElapsed = elapsed > 2000; + assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed); + assertMockParser(m); + } + + @Test + public void testHeavyHang() throws Exception { + long start = new Date().getTime(); + Metadata m = new Metadata(); + + String content = getXML("heavy_hang.xml", m).xml; + assertMockParser(m); + long elapsed = new Date().getTime()-start; + //should sleep for at least 3000 + boolean enoughTimeHasElapsed = elapsed > 2000; + assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed); + assertMockParser(m); + } + + @Test + public void testFakeOOM() throws Exception { + Metadata m = new Metadata(); + assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom"); + assertMockParser(m); + } + + @Test + public void testRealOOM() throws Exception { + //Note: we're not actually testing the diff between fake and real oom + //i.e. by creating child process and setting different -Xmx or + //memory profiling. + Metadata m = new Metadata(); + assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space"); + assertMockParser(m); + } + + @Test + public void testInterruptibleSleep() { + //Without static initialization of the parser, it can take ~1 second after t.start() + //before the parser actually calls parse. This is + //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc. + //This is not thread creation overhead. + ParserRunnable r = new ParserRunnable("sleep_interruptible.xml"); + Thread t = new Thread(r); + t.start(); + long start = new Date().getTime(); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + //swallow + } + + t.interrupt(); + + try { + t.join(10000); + } catch (InterruptedException e) { + //swallow + } + long elapsed = new Date().getTime()-start; + boolean shortEnough = elapsed < 2000;//the xml file specifies 3000 + assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough); + } + + @Test + public void testNonInterruptibleSleep() { + ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml"); + Thread t = new Thread(r); + t.start(); + long start = new Date().getTime(); + try { + //make sure that the thread has actually started + Thread.sleep(1000); + } catch (InterruptedException e) { + //swallow + } + t.interrupt(); + try { + t.join(20000); + } catch (InterruptedException e) { + //swallow + } + long elapsed = new Date().getTime()-start; + boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000 + assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough); + } + + private class ParserRunnable implements Runnable { + private final String path; + ParserRunnable(String path) { + this.path = path; + } + @Override + public void run() { + Metadata m = new Metadata(); + try { + getXML(path, m); + } catch (Exception e) { + throw new RuntimeException(e); + } finally { + assertMockParser(m); + } + } + } + + private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) { + + try { + getXML(path, m); + } catch (Throwable t) { + //if this is a throwable wrapped in a TikaException, use the cause + if (t instanceof TikaException && t.getCause() != null) { + t = t.getCause(); + } + if (! (t.getClass().isAssignableFrom(expected))){ + fail(t.getClass() +" is not assignable from "+expected); + } + if (message != null) { + assertEquals(message, t.getMessage()); + } + } + } + + private void assertMockParser(Metadata m) { + String[] parsers = m.getValues("X-Parsed-By"); + //make sure that it was actually parsed by mock. + boolean parsedByMock = false; + for (String parser : parsers) { + if (parser.equals("org.apache.tika.parser.mock.MockParser")) { + parsedByMock = true; + break; + } + } + assertTrue("mock parser should have been called", parsedByMock); + } +} Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1679211&r1=1679210&r2=1679211&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java Wed May 13 13:49:36 2015 @@ -1,137 +1,137 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.pdf; - - -import static org.junit.Assert.assertTrue; - -import org.apache.tika.exception.AccessPermissionException; -import org.apache.tika.metadata.AccessPermissions; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.PropertyTypeException; -import org.junit.Test; - -public class AccessCheckerTest { - - @Test - public void testLegacy() throws AccessPermissionException{ - - Metadata m = getMetadata(false, false); - //legacy behavior; don't bother checking - AccessChecker checker = new AccessChecker(); - checker.check(m); - assertTrue("no exception", true); - - m = getMetadata(false, true); - assertTrue("no exception", true); - checker.check(m); - - m = getMetadata(true, true); - assertTrue("no exception", true); - checker.check(m); - } - - @Test - public void testNoExtraction() { - - Metadata m = null; - //allow nothing - AccessChecker checker = new AccessChecker(false); - boolean ex = false; - try { - m = getMetadata(false, false); - checker.check(m); - } catch (AccessPermissionException e) { - ex = true; - } - assertTrue("correct exception with no extraction, no extract for accessibility", ex); - ex = false; - try { - //document allows extraction for accessibility - m = getMetadata(false, true); - checker.check(m); - } catch (AccessPermissionException e) { - //but application is not an accessibility application - ex = true; - } - assertTrue("correct exception with no extraction, no extract for accessibility", ex); - } - - @Test - public void testExtractOnlyForAccessibility() throws AccessPermissionException { - Metadata m = getMetadata(false, true); - //allow accessibility - AccessChecker checker = new AccessChecker(true); - checker.check(m); - assertTrue("no exception", true); - boolean ex = false; - try { - m = getMetadata(false, false); - checker.check(m); - } catch (AccessPermissionException e) { - ex = true; - } - assertTrue("correct exception", ex); - } - - @Test - public void testCrazyExtractNotForAccessibility() throws AccessPermissionException { - Metadata m = getMetadata(true, false); - //allow accessibility - AccessChecker checker = new AccessChecker(true); - checker.check(m); - assertTrue("no exception", true); - - //don't extract for accessibility - checker = new AccessChecker(false); - //if extract content is allowed, the checker shouldn't - //check the value of extract for accessibility - checker.check(m); - assertTrue("no exception", true); - - } - - @Test - public void testCantAddMultiplesToMetadata() { - Metadata m = new Metadata(); - boolean ex = false; - m.add(AccessPermissions.EXTRACT_CONTENT, "true"); - try { - m.add(AccessPermissions.EXTRACT_CONTENT, "false"); - } catch (PropertyTypeException e) { - ex = true; - } - assertTrue("can't add multiple values", ex); - - m = new Metadata(); - ex = false; - m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true"); - try { - m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false"); - } catch (PropertyTypeException e) { - ex = true; - } - assertTrue("can't add multiple values", ex); - } - - private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) { - Metadata m = new Metadata(); - m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction)); - m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(allowExtractionForAccessibility)); - return m; - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + + +import static org.junit.Assert.assertTrue; + +import org.apache.tika.exception.AccessPermissionException; +import org.apache.tika.metadata.AccessPermissions; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.PropertyTypeException; +import org.junit.Test; + +public class AccessCheckerTest { + + @Test + public void testLegacy() throws AccessPermissionException{ + + Metadata m = getMetadata(false, false); + //legacy behavior; don't bother checking + AccessChecker checker = new AccessChecker(); + checker.check(m); + assertTrue("no exception", true); + + m = getMetadata(false, true); + assertTrue("no exception", true); + checker.check(m); + + m = getMetadata(true, true); + assertTrue("no exception", true); + checker.check(m); + } + + @Test + public void testNoExtraction() { + + Metadata m = null; + //allow nothing + AccessChecker checker = new AccessChecker(false); + boolean ex = false; + try { + m = getMetadata(false, false); + checker.check(m); + } catch (AccessPermissionException e) { + ex = true; + } + assertTrue("correct exception with no extraction, no extract for accessibility", ex); + ex = false; + try { + //document allows extraction for accessibility + m = getMetadata(false, true); + checker.check(m); + } catch (AccessPermissionException e) { + //but application is not an accessibility application + ex = true; + } + assertTrue("correct exception with no extraction, no extract for accessibility", ex); + } + + @Test + public void testExtractOnlyForAccessibility() throws AccessPermissionException { + Metadata m = getMetadata(false, true); + //allow accessibility + AccessChecker checker = new AccessChecker(true); + checker.check(m); + assertTrue("no exception", true); + boolean ex = false; + try { + m = getMetadata(false, false); + checker.check(m); + } catch (AccessPermissionException e) { + ex = true; + } + assertTrue("correct exception", ex); + } + + @Test + public void testCrazyExtractNotForAccessibility() throws AccessPermissionException { + Metadata m = getMetadata(true, false); + //allow accessibility + AccessChecker checker = new AccessChecker(true); + checker.check(m); + assertTrue("no exception", true); + + //don't extract for accessibility + checker = new AccessChecker(false); + //if extract content is allowed, the checker shouldn't + //check the value of extract for accessibility + checker.check(m); + assertTrue("no exception", true); + + } + + @Test + public void testCantAddMultiplesToMetadata() { + Metadata m = new Metadata(); + boolean ex = false; + m.add(AccessPermissions.EXTRACT_CONTENT, "true"); + try { + m.add(AccessPermissions.EXTRACT_CONTENT, "false"); + } catch (PropertyTypeException e) { + ex = true; + } + assertTrue("can't add multiple values", ex); + + m = new Metadata(); + ex = false; + m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true"); + try { + m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false"); + } catch (PropertyTypeException e) { + ex = true; + } + assertTrue("can't add multiple values", ex); + } + + private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) { + Metadata m = new Metadata(); + m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction)); + m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(allowExtractionForAccessibility)); + return m; + } +}
