This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4207 by this push: new 59608e69b TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted 59608e69b is described below commit 59608e69bdaeb8a8151e1e9f27b1ef7c3030288b Author: tallison <talli...@apache.org> AuthorDate: Thu Mar 21 17:19:37 2024 -0400 TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted --- .../AbstractEmbeddedDocumentByteStore.java | 3 +- .../extractor/BasicEmbeddedDocumentByteStore.java | 16 ++-- .../tika/extractor/EmbeddedDocumentByteStore.java | 5 +- .../tika/extractor/EmbeddedDocumentUtil.java | 2 +- .../ParsingEmbeddedDocumentExtractor.java | 40 +++++++-- .../ParsingEmbeddedDocumentExtractorFactory.java | 22 ++++- .../org/apache/tika/io/BoundedInputStream.java | 4 + .../java/org/apache/tika/pipes/PipesServer.java | 5 +- .../extractor/EmbeddedDocumentBytesConfig.java | 6 +- .../extractor/EmbeddedDocumentEmitterStore.java | 9 +- .../org/apache/tika/pipes/PipesServerTest.java | 58 ++++++++++++- .../apache/tika/pipes/TIKA-4207-limit-bytes.xml | 34 ++++++++ .../parser/microsoft/pst/OutlookPSTParserTest.java | 2 +- .../apache/tika/parser/pdf/PDFRenderingTest.java | 2 +- .../apache/tika/server/standard/TikaPipesTest.java | 97 +++++++++++++++++++++- 15 files changed, 270 insertions(+), 35 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java index 214c2ab4e..15b26451a 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java @@ -17,6 +17,7 @@ package org.apache.tika.extractor; import java.io.IOException; +import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Locale; @@ -57,7 +58,7 @@ public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocum } @Override - public void add(int id, Metadata metadata, byte[] bytes) throws IOException { + public void add(int id, Metadata metadata, InputStream bytes) throws IOException { ids.add(id); } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java index b41285eb0..d3aeb4507 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java @@ -17,9 +17,13 @@ package org.apache.tika.extractor; import java.io.IOException; +import java.io.InputStream; import java.util.HashMap; import java.util.Map; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.UnsynchronizedBufferedInputStream; + import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; @@ -30,13 +34,15 @@ public class BasicEmbeddedDocumentByteStore extends AbstractEmbeddedDocumentByte } //this won't scale, but let's start fully in memory for now; Map<Integer, byte[]> docBytes = new HashMap<>(); - public void add(int id, Metadata metadata, byte[] bytes) throws IOException { - super.add(id, metadata, bytes); - docBytes.put(id, bytes); + @Override + public void add(int id, Metadata metadata, InputStream is) throws IOException { + super.add(id, metadata, is); + docBytes.put(id, IOUtils.toByteArray(is)); } - public byte[] getDocument(int id) { - return docBytes.get(id); + @Override + public InputStream getDocument(int id) throws IOException { + return new UnsynchronizedBufferedInputStream.Builder().setByteArray(docBytes.get(id)).get(); } @Override diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java index ad1bb81f3..8e1e8e325 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java @@ -18,15 +18,16 @@ package org.apache.tika.extractor; import java.io.Closeable; import java.io.IOException; +import java.io.InputStream; import java.util.List; import org.apache.tika.metadata.Metadata; public interface EmbeddedDocumentByteStore extends Closeable { //we need metadata for the emitter store...can we get away without it? - void add(int id, Metadata metadata, byte[] bytes) throws IOException; + void add(int id, Metadata metadata, InputStream inputStream) throws IOException; - byte[] getDocument(int id); + InputStream getDocument(int id) throws IOException; List<Integer> getIds(); } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index d6e2c28a8..99a3f3921 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -92,7 +92,7 @@ public class EmbeddedDocumentUtil implements Serializable { context.set(Parser.class, new AutoDetectParser(tikaConfig)); } } - EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context); + EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context, 0); context.set(EmbeddedDocumentExtractor.class, ex); return ex; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index ee15c1e22..97cf5b57f 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -35,6 +35,7 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -45,7 +46,6 @@ import org.apache.tika.parser.ParseRecord; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; -import org.apache.tika.utils.ExceptionUtils; /** * Helper class for parsers of package archives or other compound document @@ -68,8 +68,12 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL; - public ParsingEmbeddedDocumentExtractor(ParseContext context) { + private long bytesExtracted = 0; + private final long maxEmbeddedBytesForExtraction; + + public ParsingEmbeddedDocumentExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) { this.context = context; + this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction; } public boolean shouldParseEmbedded(Metadata metadata) { @@ -139,6 +143,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { + //TODO -- improve the efficiency of this so that we're not + //literally writing out a file per request Path p = stream.getPath(); try { parse(stream, handler, metadata); @@ -157,7 +163,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract private void storeEmbeddedBytes(Path p, Metadata metadata) { if (! embeddedBytesSelector.select(metadata)) { if (LOGGER.isDebugEnabled()) { - LOGGER.debug("skipping embedded bytes {} {}", + LOGGER.debug("skipping embedded bytes {} <-> {}", metadata.get(Metadata.CONTENT_TYPE), metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); } @@ -166,12 +172,30 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract EmbeddedDocumentByteStore embeddedDocumentByteStore = context.get(EmbeddedDocumentByteStore.class); int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID); - - try { - embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p)); + try (InputStream is = Files.newInputStream(p)) { + if (bytesExtracted >= maxEmbeddedBytesForExtraction) { + throw new IOException("Bytes extracted (" + bytesExtracted + + ") >= max allowed (" + maxEmbeddedBytesForExtraction + ")"); + } + long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted; + + try (BoundedInputStream boundedIs = new BoundedInputStream(maxToRead, is)) { + embeddedDocumentByteStore.add(id, metadata, boundedIs); + bytesExtracted += boundedIs.getPos(); + if (boundedIs.hasHitBound()) { + throw new IOException("Bytes extracted (" + bytesExtracted + + ") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " + + "bytes"); + } + } } catch (IOException e) { - metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, - ExceptionUtils.getStackTrace(e)); + LOGGER.warn("problem writing out embedded bytes", e); + //info in metadata doesn't actually make it back to the metadata list + //because we're filtering and cloning the metadata at the end of the parse + //which happens before we try to copy out the files. + //TODO fix this + //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, + // ExceptionUtils.getStackTrace(e)); } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java index 7632ed49c..fd8cf54b1 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Set; import org.apache.tika.config.Field; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -33,6 +34,7 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.EMPTY_SET; private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET; + private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 1024l;//10GB @Field public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; @@ -65,15 +67,33 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument } + /** + * Total number of bytes to write out. A good zip bomb may contain petabytes + * compressed into a few kb. Make sure that you can't fill up a disk! + * + * This does not include the container file in the count of bytes written out. + * This only counts the lengths of the embedded files. + * + * @param maxEmbeddedBytesForExtraction + */ + @Field + public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) throws TikaConfigException { + if (maxEmbeddedBytesForExtraction < 0) { + throw new TikaConfigException("maxEmbeddedBytesForExtraction must be >= 0"); + } + this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction; + } @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext); + ParsingEmbeddedDocumentExtractor ex = + new ParsingEmbeddedDocumentExtractor(parseContext, maxEmbeddedBytesForExtraction); ex.setWriteFileNameToContent(writeFileNameToContent); ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector()); return ex; } + private EmbeddedBytesSelector createEmbeddedBytesSelector() { if (embeddedBytesIncludeMimeTypes.size() == 0 && embeddedBytesExcludeMimeTypes.size() == 0 && diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java index a80009cd2..31290cc1a 100644 --- a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java @@ -147,4 +147,8 @@ public class BoundedInputStream extends InputStream { public long transferTo(OutputStream out) throws IOException { return in.transferTo(out); } + + public long getPos() { + return pos; + } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index 851805d06..5cc22d378 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -546,6 +546,7 @@ public class PipesServer implements Runnable { return parseContext; } + //TODO: clean this up. if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) { parseContext.set(EmbeddedDocumentByteStore.class, new EmbeddedDocumentEmitterStore(fetchEmitTuple.getEmitKey(), @@ -678,8 +679,8 @@ public class PipesServer implements Runnable { t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) { EmbeddedDocumentByteStore embeddedDocumentByteStore = parseContext.get(EmbeddedDocumentByteStore.class); - try { - embeddedDocumentByteStore.add(0, metadata, Files.readAllBytes(tis.getPath())); + try (InputStream is = Files.newInputStream(tis.getPath())) { + embeddedDocumentByteStore.add(0, metadata, is); } catch (IOException e) { LOG.warn("problem reading source file into embedded document byte store", e); } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java index 42538ff80..66b7321ac 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java @@ -44,11 +44,7 @@ public class EmbeddedDocumentBytesConfig implements Serializable { } } private final boolean extractEmbeddedDocumentBytes; - //TODO -- add these at some point - /* - private Set<String> includeMimeTypes = new HashSet<>(); - private Set<String> excludeMimeTypes = new HashSet<>(); - */ + private int zeroPadName = 0; private SUFFIX_STRATEGY suffixStrategy = SUFFIX_STRATEGY.NONE; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java index 915b44d44..5d09cfe18 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java @@ -18,9 +18,9 @@ package org.apache.tika.pipes.extractor; import java.io.Closeable; import java.io.IOException; +import java.io.InputStream; import org.apache.commons.io.IOExceptionWithCause; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.AbstractEmbeddedDocumentByteStore; @@ -53,20 +53,19 @@ public class EmbeddedDocumentEmitterStore extends AbstractEmbeddedDocumentByteSt } @Override - public void add(int id, Metadata metadata, byte[] bytes) throws IOException { + public void add(int id, Metadata metadata, InputStream inputStream) throws IOException { //intentionally do not call super.add, because we want the ids list to be empty String emitKey = getEmitKey(containerEmitKey.getEmitKey(), id, embeddedDocumentBytesConfig, metadata); - try { - emitter.emit(emitKey, new UnsynchronizedByteArrayInputStream(bytes), METADATA); + emitter.emit(emitKey, inputStream, METADATA); } catch (TikaEmitterException e) { throw new IOExceptionWithCause(e); } } @Override - public byte[] getDocument(int id) { + public InputStream getDocument(int id) { throw new UnsupportedOperationException("this is emit only."); } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java index 857bf485f..6f55e5d11 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java @@ -116,8 +116,10 @@ public class PipesServerTest extends TikaTest { parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); assertEquals(2, parseData.metadataList.size()); - byte[] bytes0 = parseData.getEmbeddedDocumentByteStore().getDocument(0); - byte[] bytes1 = parseData.getEmbeddedDocumentByteStore().getDocument(1); + byte[] bytes0 = + IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0)); + byte[] bytes1 = + IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1)); assertContains("is to trigger mock on the embedded", new String(bytes0, StandardCharsets.UTF_8)); @@ -127,4 +129,56 @@ public class PipesServerTest extends TikaTest { assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a", parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256")); } + + @Test + public void testEmbeddedStreamEmitterLimitBytes() throws Exception { + Path tmp = Paths.get("/home/tallison/Desktop/tmp"); + if (Files.isDirectory(tmp)) { + FileUtils.deleteDirectory(tmp.toFile()); + } + Files.createDirectories(tmp); + Path tikaConfig = tmp.resolve("tika-config.xml"); + + String xml = IOUtils.toString( + PipesServerTest.class.getResourceAsStream("TIKA-4207-limit-bytes.xml"), + StandardCharsets.UTF_8); + xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString()); + Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8)); + + Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"), + tmp.resolve("mock.xml")); + + PipesServer pipesServer = new PipesServer(tikaConfig, + new UnsynchronizedByteArrayInputStream(new byte[0]), + new PrintStream(new UnsynchronizedByteArrayOutputStream(), true, + StandardCharsets.UTF_8.name()), + -1, 30000, 30000); + + pipesServer.initializeResources(); + EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = + new EmbeddedDocumentBytesConfig(true); + embeddedDocumentBytesConfig.setIncludeOriginal(true); + + FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id", + new FetchKey("fs", "mock.xml"), + new EmitKey("", ""), new Metadata(), + HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, + embeddedDocumentBytesConfig); + Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); + PipesServer.MetadataListAndEmbeddedBytes + parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); + assertEquals(2, parseData.metadataList.size()); + + byte[] bytes0 = + IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0)); + byte[] bytes1 = + IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1)); + + assertContains("is to trigger mock on the embedded", + new String(bytes0, StandardCharsets.UTF_8)); + + assertEquals(10, bytes1.length); + assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a", + parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256")); + } } diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml new file mode 100644 index 000000000..610bad77b --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml @@ -0,0 +1,34 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <autoDetectParserConfig> + <digesterFactory class="org.apache.tika.pipes.async.MockDigesterFactory"> + <skipContainerDocument>false</skipContainerDocument> + </digesterFactory> + <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory"> + <writeFileNameToContent>false</writeFileNameToContent> + <maxEmbeddedBytesForExtraction>10</maxEmbeddedBytesForExtraction> + </embeddedDocumentExtractorFactory> + </autoDetectParserConfig> + <fetchers> + <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher"> + <name>fs</name> + <basePath>BASE_PATH</basePath> + </fetcher> + </fetchers> +</properties> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index c95547aee..bcd45460c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -150,7 +150,7 @@ public class OutlookPSTParserTest extends TikaTest { List<Metadata> trackingMetadata = new ArrayList<>(); public EmbeddedTrackingExtrator(ParseContext context) { - super(context); + super(context, 0); } @Override diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java index 08d18b6c1..8503e8bd8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java @@ -112,7 +112,7 @@ public class PDFRenderingTest extends TikaTest { Map<Integer, byte[]> embedded = new HashMap<>(); public RenderCaptureExtractor(ParseContext context) { - super(context); + super(context, 0); } public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java index 7f41e065c..110c3f7e8 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java @@ -16,20 +16,27 @@ */ package org.apache.tika.server.standard; +import static org.apache.tika.TikaTest.debug; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringWriter; import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitResult; +import java.nio.file.FileVisitor; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import jakarta.ws.rs.core.Response; import org.apache.cxf.jaxrs.JAXRSServerFactoryBean; @@ -49,6 +56,7 @@ import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.HandlerConfig; import org.apache.tika.pipes.emitter.EmitKey; +import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.fetcher.FetcherManager; import org.apache.tika.sax.BasicContentHandlerFactory; @@ -72,6 +80,7 @@ public class TikaPipesTest extends CXFTestBase { private static Path TMP_WORKING_DIR; private static Path TMP_OUTPUT_DIR; private static Path TMP_OUTPUT_FILE; + private static Path TMP_BYTES_DIR; private static Path TIKA_PIPES_LOG4j2_PATH; private static Path TIKA_CONFIG_PATH; private static String TIKA_CONFIG_XML; @@ -81,6 +90,7 @@ public class TikaPipesTest extends CXFTestBase { public static void setUpBeforeClass() throws Exception { Path inputDir = TMP_WORKING_DIR.resolve("input"); TMP_OUTPUT_DIR = TMP_WORKING_DIR.resolve("output"); + TMP_BYTES_DIR = TMP_WORKING_DIR.resolve("bytes"); TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json"); Files.createDirectories(inputDir); @@ -103,7 +113,10 @@ public class TikaPipesTest extends CXFTestBase { "<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" + "<params>" + "<name>fse</name>" + "<basePath>" + TMP_OUTPUT_DIR.toAbsolutePath() + "</basePath>" + "</params>" + - "</emitter>" + "</emitters>" + "<pipes><params><tikaConfig>" + + "</emitter>" + "<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" + + "<params>" + "<name>bytes</name>" + "<basePath>" + + TMP_BYTES_DIR.toAbsolutePath() + "</basePath>" + "</params>" + + "</emitter>" +"</emitters>" + "<pipes><params><tikaConfig>" + ProcessUtils.escapeCommandLine( TIKA_CONFIG_PATH.toAbsolutePath().toString()) + "</tikaConfig><numClients>10</numClients>" + "<forkedJvmArgs>" + @@ -203,4 +216,86 @@ public class TikaPipesTest extends CXFTestBase { assertContains("When in the Course", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); } + + @Test + public void testBytes() throws Exception { + EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(true); + config.setEmitter("bytes"); + config.setIncludeOriginal(true); + config.setEmbeddedIdPrefix("-"); + config.setZeroPadNameLength(10); + config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING); + + FetchEmitTuple t = + new FetchEmitTuple("myId", new FetchKey("fsf", "test_recursive_embedded.docx"), + new EmitKey("fse", "test_recursive_embedded.docx"), new Metadata(), + new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + HandlerConfig.PARSE_MODE.RMETA, -1, -1, false), + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, config); + StringWriter writer = new StringWriter(); + JsonFetchEmitTuple.toJson(t, writer); + + String getUrl = endPoint + PIPES_PATH; + Response response = + WebClient.create(getUrl).accept("application/json").post(writer.toString()); + assertEquals(200, response.getStatus()); + + List<Metadata> metadataList = null; + try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) { + metadataList = JsonMetadataList.fromJson(reader); + } + assertEquals(12, metadataList.size()); + assertContains("When in the Course", + metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT)); + Map<String, Long> expected = loadExpected(); + Map<String, Long> byteFileNames = getFileNames(TMP_BYTES_DIR); + assertEquals(expected, byteFileNames); + } + + private Map<String, Long> loadExpected() { + Map<String, Long> m = new HashMap<>(); + m.put("test_recursive_embedded.docx-0000000009.txt", 8151l); + m.put("test_recursive_embedded.docx-0000000007.txt", 8l); + m.put("test_recursive_embedded.docx-0000000006.txt", 8l); + m.put("test_recursive_embedded.docx-0000000002.zip", 4827l); + m.put("test_recursive_embedded.docx-0000000001.emf", 4992l); + m.put("test_recursive_embedded.docx-0000000008.zip", 4048l); + m.put("test_recursive_embedded.docx-0000000004.txt", 8l); + m.put("test_recursive_embedded.docx-0000000000.docx", 27082l); + m.put("test_recursive_embedded.docx-0000000003.txt", 8l); + m.put("test_recursive_embedded.docx-0000000011.txt", 7l); + m.put("test_recursive_embedded.docx-0000000005.zip", 4492l); + m.put("test_recursive_embedded.docx-0000000010.zip", 163l); + return m; + } + + private Map<String, Long> getFileNames(Path p) throws Exception { + final Map<String, Long> ret = new HashMap<>(); + Files.walkFileTree(TMP_BYTES_DIR, new FileVisitor<Path>() { + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) + throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { + ret.put(file.getFileName().toString(), Files.size(file)); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) + throws IOException { + return FileVisitResult.CONTINUE; + } + }); + return ret; + } }