Repository: tika Updated Branches: refs/heads/master b9befb427 -> aa7a0c353
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json new file mode 100644 index 0000000..6ef09de --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json @@ -0,0 +1,5 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog 1,200 120000", + "xmpTPg:NPages":2 +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json new file mode 100644 index 0000000..0e2558b --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"El zorro marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json new file mode 100644 index 0000000..5371c87 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json @@ -0,0 +1,10 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" + }, + { + "Content-Type":"text/plain", + "X-TIKA:embedded_resource_path":"inner.txt", + "X-TIKA:content":"attachment contents" + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json new file mode 100644 index 0000000..5371c87 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json @@ -0,0 +1,10 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" + }, + { + "Content-Type":"text/plain", + "X-TIKA:embedded_resource_path":"inner.txt", + "X-TIKA:content":"attachment contents" + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json new file mode 100644 index 0000000..18763d1 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json new file mode 100644 index 0000000..18763d1 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file5_emptyA.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file5_emptyA.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file5_emptyA.pdf.json new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file6_accessEx.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file6_accessEx.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file6_accessEx.pdf.json new file mode 100644 index 0000000..ded29af --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file6_accessEx.pdf.json @@ -0,0 +1 @@ +[{"Content-Type":"application/pdf","X-Parsed-By":["org.apache.tika.parser.DefaultParser","org.apache.tika.parser.pdf.PDFParser"],"X-TIKA:EXCEPTION:runtime":"org.apache.tika.exception.AccessPermissionException: Content extraction is not allowed.\n\tat org.apache.tika.parser.pdf.AccessChecker.check(AccessChecker.java:77)\n\tat org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:147)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:270)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:270)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)\n\tat org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:130)\n\tat org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.processFileResource(RecursiveParserWrapperFSConsumer.java:123)\n\tat org.apache.tika.batch.FileResourceConsumer._processFileResource(FileResourceConsumer.java:171)\n\tat org.apache.tika.batch.FileResourceConsumer.call(F ileResourceConsumer.java:104)\n\tat org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:44)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)\n\tat java.lang.Thread.run(Thread.java:745)\n","access_permission:assemble_document":"false","access_permission:can_modify":"false","access_permission:can_print":"true","access_permission:can_print_degraded":"true","access_permission:extract_content":"false","access_permission:extract_for_accessibility":"true","access_permission:fill_in_form":"false","access_permission:modify_annotations":"false","pdf:encrypted":"true","resourceName":"file3_accessEx","tika:file_ext":"pdf","tika_batch_fs:relative_path": "file3_accessEx","xmpTPg:NPages":"4"}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file7_badJson.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file7_badJson.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file7_badJson.pdf.json new file mode 100644 index 0000000..8cf61da --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file7_badJson.pdf.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":2,100 +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file8_IOEx.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file8_IOEx.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file8_IOEx.pdf.json new file mode 100644 index 0000000..4ecf0e8 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file8_IOEx.pdf.json @@ -0,0 +1 @@ +[{"Content-Length":"479562","Content-Type":"application/pdf","X-Parsed-By":["org.apache.tika.parser.DefaultParser","org.apache.tika.parser.pdf.PDFParser"],"X-TIKA:EXCEPTION:runtime":"java.lang.RuntimeException: java.io.IOException: Value is not an integer: 8546736428538085463808\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser$1.tryNext(PDFStreamParser.java:186)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser$1.hasNext(PDFStreamParser.java:193)\n\tat org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:255)\n\tat org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235)\n\tat org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215)\n\tat org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:456)\n\tat org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:381)\n\tat org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:340)\n\tat org.apache.tika.parser.pdf.PDF 2XHTML.process(PDF2XHTML.java:106)\n\tat org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:148)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:247)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:247)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)\n\tat org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:130)\n\tat org.apache.tika.batch.FileResourceConsumer.parse(FileResourceConsumer.java:410)\n\tat org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.processFileResource(RecursiveParserWrapperFSConsumer.java:106)\n\tat org.apache.tika.batch.FileResourceConsumer._processFileResource(FileResourceConsumer.java:182)\n\tat org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:115)\n\tat org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:49)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurr ent.Executors$RunnableAdapter.call(Executors.java:471)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)\n\tat java.lang.Thread.run(Thread.java:745)\nCaused by: java.io.IOException: Value is not an integer: 8546736428538085463808\n\tat org.apache.pdfbox.cos.COSNumber.get(COSNumber.java:104)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser.parseNextToken(PDFStreamParser.java:350)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser.access$000(PDFStreamParser.java:46)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser$1.tryNext(PDFStreamParser.java:181)\n\t... 24 more\n","access_permission:assemble_document":"true","access_permission:can_modify":"true","access_permission:can_print":"true","access_permission:can_print_degraded":"true","access_permission:extract_content":"true","access_permission:extract_for _accessibility":"true","access_permission:fill_in_form":"true","access_permission:modify_annotations":"true", "resourceName":"file8_IOEx.pdf","tika:file_ext":"pdf","tika_batch_fs:relative_path":"file8_IOEx.pdf"}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file1.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file1.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsB/file1.pdf.json new file mode 100644 index 0000000..cbb51cf --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file1.pdf.json @@ -0,0 +1,2 @@ +[{ "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox jumped the lazy dog aardvark aardvark aardvark bear bear"}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file11_oom.txt.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file11_oom.txt.json b/tika-eval/src/test/resources/test-dirs/extractsB/file11_oom.txt.json new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file12_es.txt.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file12_es.txt.json b/tika-eval/src/test/resources/test-dirs/extractsB/file12_es.txt.json new file mode 100644 index 0000000..0e2558b --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file12_es.txt.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"El zorro marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file13_attachANotB.doc.txt ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file13_attachANotB.doc.txt b/tika-eval/src/test/resources/test-dirs/extractsB/file13_attachANotB.doc.txt new file mode 100644 index 0000000..240a94e --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file13_attachANotB.doc.txt @@ -0,0 +1 @@ +the quick brown fox fox fox jumped over the lazy lazy dog http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file2_attachANotB.doc.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file2_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsB/file2_attachANotB.doc.json new file mode 100644 index 0000000..18763d1 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file2_attachANotB.doc.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json b/tika-eval/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json new file mode 100644 index 0000000..5371c87 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json @@ -0,0 +1,10 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" + }, + { + "Content-Type":"text/plain", + "X-TIKA:embedded_resource_path":"inner.txt", + "X-TIKA:content":"attachment contents" + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file4_emptyB.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file4_emptyB.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsB/file4_emptyB.pdf.json new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file5_emptyA.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file5_emptyA.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsB/file5_emptyA.pdf.json new file mode 100644 index 0000000..18763d1 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file5_emptyA.pdf.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file6_accessEx.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file6_accessEx.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsB/file6_accessEx.pdf.json new file mode 100644 index 0000000..ded29af --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file6_accessEx.pdf.json @@ -0,0 +1 @@ +[{"Content-Type":"application/pdf","X-Parsed-By":["org.apache.tika.parser.DefaultParser","org.apache.tika.parser.pdf.PDFParser"],"X-TIKA:EXCEPTION:runtime":"org.apache.tika.exception.AccessPermissionException: Content extraction is not allowed.\n\tat org.apache.tika.parser.pdf.AccessChecker.check(AccessChecker.java:77)\n\tat org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:147)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:270)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:270)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)\n\tat org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:130)\n\tat org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.processFileResource(RecursiveParserWrapperFSConsumer.java:123)\n\tat org.apache.tika.batch.FileResourceConsumer._processFileResource(FileResourceConsumer.java:171)\n\tat org.apache.tika.batch.FileResourceConsumer.call(F ileResourceConsumer.java:104)\n\tat org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:44)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)\n\tat java.lang.Thread.run(Thread.java:745)\n","access_permission:assemble_document":"false","access_permission:can_modify":"false","access_permission:can_print":"true","access_permission:can_print_degraded":"true","access_permission:extract_content":"false","access_permission:extract_for_accessibility":"true","access_permission:fill_in_form":"false","access_permission:modify_annotations":"false","pdf:encrypted":"true","resourceName":"file3_accessEx","tika:file_ext":"pdf","tika_batch_fs:relative_path": "file3_accessEx","xmpTPg:NPages":"4"}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file7_badJson.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file7_badJson.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsB/file7_badJson.pdf.json new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsB/file8_IOEx.pdf.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file8_IOEx.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsB/file8_IOEx.pdf.json new file mode 100644 index 0000000..4ecf0e8 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsB/file8_IOEx.pdf.json @@ -0,0 +1 @@ +[{"Content-Length":"479562","Content-Type":"application/pdf","X-Parsed-By":["org.apache.tika.parser.DefaultParser","org.apache.tika.parser.pdf.PDFParser"],"X-TIKA:EXCEPTION:runtime":"java.lang.RuntimeException: java.io.IOException: Value is not an integer: 8546736428538085463808\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser$1.tryNext(PDFStreamParser.java:186)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser$1.hasNext(PDFStreamParser.java:193)\n\tat org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:255)\n\tat org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235)\n\tat org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215)\n\tat org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:456)\n\tat org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:381)\n\tat org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:340)\n\tat org.apache.tika.parser.pdf.PDF 2XHTML.process(PDF2XHTML.java:106)\n\tat org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:148)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:247)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:247)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)\n\tat org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:130)\n\tat org.apache.tika.batch.FileResourceConsumer.parse(FileResourceConsumer.java:410)\n\tat org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.processFileResource(RecursiveParserWrapperFSConsumer.java:106)\n\tat org.apache.tika.batch.FileResourceConsumer._processFileResource(FileResourceConsumer.java:182)\n\tat org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:115)\n\tat org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:49)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurr ent.Executors$RunnableAdapter.call(Executors.java:471)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:262)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)\n\tat java.lang.Thread.run(Thread.java:745)\nCaused by: java.io.IOException: Value is not an integer: 8546736428538085463808\n\tat org.apache.pdfbox.cos.COSNumber.get(COSNumber.java:104)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser.parseNextToken(PDFStreamParser.java:350)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser.access$000(PDFStreamParser.java:46)\n\tat org.apache.pdfbox.pdfparser.PDFStreamParser$1.tryNext(PDFStreamParser.java:181)\n\t... 24 more\n","access_permission:assemble_document":"true","access_permission:can_modify":"true","access_permission:can_print":"true","access_permission:can_print_degraded":"true","access_permission:extract_content":"true","access_permission:extract_for _accessibility":"true","access_permission:fill_in_form":"true","access_permission:modify_annotations":"true", "resourceName":"file8_IOEx.pdf","tika:file_ext":"pdf","tika_batch_fs:relative_path":"file8_IOEx.pdf"}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file1.pdf ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file1.pdf b/tika-eval/src/test/resources/test-dirs/raw_input/file1.pdf new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file1.pdf @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file11_oom.txt ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file11_oom.txt b/tika-eval/src/test/resources/test-dirs/raw_input/file11_oom.txt new file mode 100644 index 0000000..d3cf2f9 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file11_oom.txt @@ -0,0 +1,2 @@ +dummy +dummy \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file2_attachANotB.doc ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file2_attachANotB.doc b/tika-eval/src/test/resources/test-dirs/raw_input/file2_attachANotB.doc new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file2_attachANotB.doc @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file3_attachBNotA.doc ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file3_attachBNotA.doc b/tika-eval/src/test/resources/test-dirs/raw_input/file3_attachBNotA.doc new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file3_attachBNotA.doc @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file4_emptyB.pdf ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file4_emptyB.pdf b/tika-eval/src/test/resources/test-dirs/raw_input/file4_emptyB.pdf new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file4_emptyB.pdf @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file5_emptyA.pdf ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file5_emptyA.pdf b/tika-eval/src/test/resources/test-dirs/raw_input/file5_emptyA.pdf new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file5_emptyA.pdf @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file6_accessEx.pdf ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file6_accessEx.pdf b/tika-eval/src/test/resources/test-dirs/raw_input/file6_accessEx.pdf new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file6_accessEx.pdf @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file7_badJson.pdf ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file7_badJson.pdf b/tika-eval/src/test/resources/test-dirs/raw_input/file7_badJson.pdf new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file7_badJson.pdf @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file8_IOEx.pdf ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file8_IOEx.pdf b/tika-eval/src/test/resources/test-dirs/raw_input/file8_IOEx.pdf new file mode 100644 index 0000000..ef9ddba --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file8_IOEx.pdf @@ -0,0 +1,13 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +dummy source file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/raw_input/file9_noextract.txt ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/raw_input/file9_noextract.txt b/tika-eval/src/test/resources/test-dirs/raw_input/file9_noextract.txt new file mode 100644 index 0000000..5c3118d --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/raw_input/file9_noextract.txt @@ -0,0 +1 @@ +dummy file