This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4506 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 229fe28314c3e254b5107402ea8864d7945cf554 Author: tallison <[email protected]> AuthorDate: Mon Oct 6 09:33:03 2025 -0400 TIKA-4506 -- remove tika-fuzzing module --- CHANGES.txt | 6 +- pom.xml | 1 - tika-bom/pom.xml | 5 - tika-fuzzing/pom.xml | 126 -- .../apache/tika/fuzzing/AutoDetectTransformer.java | 96 -- .../java/org/apache/tika/fuzzing/Transformer.java | 40 - .../java/org/apache/tika/fuzzing/cli/FuzzOne.java | 244 ---- .../org/apache/tika/fuzzing/cli/FuzzingCLI.java | 282 ---- .../apache/tika/fuzzing/cli/FuzzingCLIConfig.java | 108 -- .../tika/fuzzing/exceptions/CantFuzzException.java | 25 - .../apache/tika/fuzzing/general/ByteDeleter.java | 51 - .../apache/tika/fuzzing/general/ByteFlipper.java | 67 - .../apache/tika/fuzzing/general/ByteInjector.java | 76 - .../tika/fuzzing/general/GeneralTransformer.java | 95 -- .../apache/tika/fuzzing/general/SpanSwapper.java | 82 -- .../org/apache/tika/fuzzing/general/Truncator.java | 61 - .../org/apache/tika/fuzzing/pdf/EvilCOSWriter.java | 1486 -------------------- .../apache/tika/fuzzing/pdf/PDFTransformer.java | 62 - .../tika/fuzzing/pdf/PDFTransformerConfig.java | 192 --- .../services/org.apache.tika.fuzzing.Transformer | 17 - tika-fuzzing/src/main/resources/log4j2.xml | 38 - tika-fuzzing/src/test/java/TestFuzzingCLI.java | 64 - tika-fuzzing/src/test/java/TestTransformer.java | 50 - .../test/resources/configs/tika-fuzzing-config.xml | 57 - tika-fuzzing/src/test/resources/log4j2.xml | 42 - .../test/resources/test-documents/heavy_hang.xml | 25 - .../test/resources/test-documents/null_pointer.xml | 25 - .../test/resources/test-documents/system_exit.xml | 25 - 28 files changed, 4 insertions(+), 3444 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 06d73730b..861b8207c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -7,7 +7,7 @@ Release 4.0.0-BETA1 - ??? * Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open a ticket if you need this behavior across email formats. - * The tika-batch module has been removed (TIKA-4333). + * Remove tika-batch (TIKA-4333). * Remove snaps deployment (TIKA-4502). @@ -15,7 +15,9 @@ Release 4.0.0-BETA1 - ??? * Removed the advanced media module (TIKA-4500). - * Remove the tika-dl module (TIKA-4499). + * Removed the tika-dl module (TIKA-4499). + + * Removed the tika-fuzzing module (TIKA-4506). OTHER CHANGES diff --git a/pom.xml b/pom.xml index 58635e421..9e59f6c55 100644 --- a/pom.xml +++ b/pom.xml @@ -50,7 +50,6 @@ <module>tika-server</module> <module>tika-integration-tests</module> <module>tika-eval</module> - <module>tika-fuzzing</module> <module>tika-translate</module> <module>tika-example</module> <module>tika-java7</module> diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml index d57fad573..f37a5aed7 100644 --- a/tika-bom/pom.xml +++ b/tika-bom/pom.xml @@ -78,11 +78,6 @@ <artifactId>tika-eval-core</artifactId> <version>4.0.0-SNAPSHOT</version> </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-fuzzing</artifactId> - <version>4.0.0-SNAPSHOT</version> - </dependency> <!-- Tika language detection modules --> <dependency> diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml deleted file mode 100644 index a02d32292..000000000 --- a/tika-fuzzing/pom.xml +++ /dev/null @@ -1,126 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> - <parent> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parent</artifactId> - <version>4.0.0-SNAPSHOT</version> - <relativePath>../tika-parent/pom.xml</relativePath> - </parent> - - <artifactId>tika-fuzzing</artifactId> - <name>Apache Tika fuzzing</name> - <url>https://tika.apache.org/</url> - - <modelVersion>4.0.0</modelVersion> - - - <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-pipes-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>commons-cli</groupId> - <artifactId>commons-cli</artifactId> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-pkg-module</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-pdf-module</artifactId> - <version>${project.version}</version> - </dependency> - <!-- logging --> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-slf4j2-impl</artifactId> - </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>jcl-over-slf4j</artifactId> - </dependency> - <!-- test --> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-digest-commons</artifactId> - <version>${project.version}</version> - </dependency> - - <!-- bring in the mock parser --> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <type>test-jar</type> - <scope>test</scope> - </dependency> - </dependencies> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-checkstyle-plugin</artifactId> - <version>${checkstyle.plugin.version}</version> - <dependencies> - <dependency> - <groupId>com.puppycrawl.tools</groupId> - <artifactId>checkstyle</artifactId> - <version>${puppycrawl.version}</version> - </dependency> - </dependencies> - <executions> - <execution> - <id>validate</id> - <phase>validate</phase> - <configuration> - <configLocation>checkstyle.xml</configLocation> - <inputEncoding>UTF-8</inputEncoding> - <consoleOutput>false</consoleOutput> - <includeTestSourceDirectory>true</includeTestSourceDirectory> - <testSourceDirectories>${project.basedir}/src/test/java</testSourceDirectories> - <violationSeverity>error</violationSeverity> - <failOnViolation>true</failOnViolation> - </configuration> - <goals> - <goal>check</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <configuration> - <archive> - <manifestEntries> - <Automatic-Module-Name>org.apache.tika.fuzzing</Automatic-Module-Name> - </manifestEntries> - </archive> - </configuration> - </plugin> - </plugins> - </build> -</project> \ No newline at end of file diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java deleted file mode 100644 index 05bf5e299..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.tika.config.ServiceLoader; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.Detector; -import org.apache.tika.exception.TikaException; -import org.apache.tika.fuzzing.general.GeneralTransformer; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MediaTypeRegistry; - -public class AutoDetectTransformer implements Transformer { - - private static final ServiceLoader DEFAULT_LOADER = - new ServiceLoader(AutoDetectTransformer.class.getClassLoader()); - - TikaConfig config = TikaConfig.getDefaultConfig(); - MediaTypeRegistry registry = config.getMediaTypeRegistry(); - Detector detector = TikaConfig.getDefaultConfig().getDetector(); - - Transformer fallback = new GeneralTransformer(); - Map<MediaType, Transformer> transformerMap = new HashMap<>(); - - public AutoDetectTransformer() { - this(DEFAULT_LOADER.loadServiceProviders(org.apache.tika.fuzzing.Transformer.class)); - } - - public AutoDetectTransformer(List<Transformer> transformers) { - for (Transformer t : transformers) { - for (MediaType mediaType : t.getSupportedTypes()) { - transformerMap.put(mediaType, t); - } - } - } - - @Override - public Set<MediaType> getSupportedTypes() { - return transformerMap.keySet(); - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException, TikaException { - try (TikaInputStream tis = TikaInputStream.get(is)) { - // Automatically detect the MIME type of the document - Metadata metadata = new Metadata(); - MediaType type = detector.detect(tis, metadata); - Transformer transformer = getTransformer(type); - transformer.transform(tis, os); - } - } - - private Transformer getTransformer(MediaType type) { - if (type == null) { - return fallback; - } - // We always work on the normalised, canonical form - type = registry.normalize(type); - - while (type != null) { - // Try finding a parser for the type - Transformer transformer = transformerMap.get(type); - if (transformer != null) { - return transformer; - } - - // Failing that, try for the parent of the type - type = registry.getSupertype(type); - } - return fallback; - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java deleted file mode 100644 index 57a710fa5..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Set; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.mime.MediaType; - -public interface Transformer { - - /** - * Returns the set of media types supported by this parser when used - * with the given parse context. - * - * @return immutable set of media types - * @since Apache Tika 1.24.1 - */ - Set<MediaType> getSupportedTypes(); - - - void transform(InputStream is, OutputStream os) throws IOException, TikaException; -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java deleted file mode 100644 index 7e55dbf1f..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.cli; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.CommandLineParser; -import org.apache.commons.cli.DefaultParser; -import org.apache.commons.cli.Option; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.fuzzing.AutoDetectTransformer; -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.fuzzing.exceptions.CantFuzzException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.utils.ExceptionUtils; - -/** - * Forked process that runs against a single input file - */ -public class FuzzOne { - private static final Logger LOG = LoggerFactory.getLogger(FuzzOne.class); - - static Options OPTIONS; - - static { - //By the time this commandline is parsed, there should be both an extracts and an inputDir - Option extracts = new Option("extracts", true, "directory for extract files"); - extracts.setRequired(true); - - - OPTIONS = new Options().addOption( - Option.builder("i").longOpt("inputFile").desc("input directory for seed files") - .hasArg(true).required(true).get()).addOption( - Option.builder("o").longOpt("outputFile").desc("output file base").hasArg(true) - .required(true).get()).addOption(Option.builder("m").longOpt("timeoutMs") - .desc("timeout in ms -- max time allowed to parse a file").hasArg(true) - .required(true).get()).addOption( - Option.builder("n").desc("thread id (thread number)").hasArg(true).required(true) - .get()).addOption(Option.builder("p").longOpt("perFile") - .desc("number of iterations to run per seed file").hasArg(true).required(true) - .get()).addOption(Option.builder("t").longOpt("maxTransformers") - .desc("maximum number of transformers to run per iteration").hasArg(true) - .required(true).get()).addOption( - Option.builder("r").longOpt("retryId").desc("which retry is this").hasArg(true) - .required(true).get()); - } - - Parser parser = new AutoDetectParser(); - - public static void main(String[] args) throws Exception { - FuzzOneConfig config = FuzzOneConfig.parse(args); - FuzzOne fuzzOne = new FuzzOne(); - fuzzOne.execute(config); - } - - private void execute(FuzzOneConfig config) { - Path src = config.inputFile; - Path targetDir = config.outputFileBase; - AutoDetectTransformer transformer = new AutoDetectTransformer(); - for (int i = 0; i < config.perFileIterations; i++) { - try { - String ext = "-" + config.threadNum + "-" + config.retryNum + "-" + i; - fuzz(ext, src, targetDir, transformer, config.timeoutMs); - } catch (IOException e) { - LOG.warn("problem transforming file", e); - } catch (CantFuzzException e) { - LOG.warn("can't fuzz this file " + src, e); - return; - } catch (TikaException e) { - e.printStackTrace(); - } - } - } - - private void fuzz(String ext, Path src, Path targetFileBase, Transformer transformer, - long timeoutMs) throws IOException, TikaException { - - Path target = - targetFileBase.getParent().resolve(targetFileBase.getFileName().toString() + ext); - - try { - transformFile(transformer, src, target); - } catch (Throwable t) { - LOG.warn("failed to transform: " + src.toString()); - Files.delete(target); - throw t; - } - ExecutorService executor = Executors.newFixedThreadPool(1); - Future<Integer> future = executor.submit(new ParseTask(target)); - - try { - int result = future.get(timeoutMs, TimeUnit.MILLISECONDS); - if (result == 1 && Files.exists(target)) { - LOG.warn("failed to delete target: " + target); - } - } catch (TimeoutException e) { - LOG.warn("timeout exception:" + target); - future.cancel(true); - writeErrFile(target, ".timeout"); - System.exit(1); - } catch (InterruptedException | ExecutionException e) { - LOG.warn("problem parsing " + target, e); - System.exit(1); - } finally { - executor.shutdownNow(); - } - } - - private void writeErrFile(Path target, String ext) { - try { - Path err = target.getParent().resolve(target.getFileName().toString() + ext); - Files.write(err, new byte[0]); - } catch (IOException e) { - LOG.warn("things aren't going right today.", e); - } - } - - private void handleThrowable(Path target, Throwable t) { - - try { - Path errMsg = - target.getParent().resolve(target.getFileName().toString() + ".stacktrace"); - Files.write(errMsg, ExceptionUtils.getStackTrace(t).getBytes(StandardCharsets.UTF_8)); - } catch (IOException e) { - LOG.warn("things aren't going right today.", t); - } - - } - - private void transformFile(Transformer transformer, Path src, Path target) - throws IOException, TikaException { - try (InputStream is = Files.newInputStream(src); - OutputStream os = Files.newOutputStream(target)) { - transformer.transform(is, os); - } - } - - private static class FuzzOneConfig { - int perFileIterations; - int maxTransformers; - int threadNum; - int retryNum; - long timeoutMs; - private Path inputFile; - private Path outputFileBase; - - static FuzzOneConfig parse(String[] args) throws ParseException { - CommandLineParser parser = new DefaultParser(); - CommandLine commandLine = parser.parse(OPTIONS, args); - FuzzOneConfig config = new FuzzOneConfig(); - config.inputFile = Paths.get(commandLine.getOptionValue("i")); - config.outputFileBase = Paths.get(commandLine.getOptionValue("o")); - config.perFileIterations = Integer.parseInt(commandLine.getOptionValue("p")); - config.maxTransformers = Integer.parseInt(commandLine.getOptionValue("t")); - config.threadNum = Integer.parseInt(commandLine.getOptionValue("n")); - config.retryNum = Integer.parseInt(commandLine.getOptionValue("r")); - config.timeoutMs = Integer.parseInt(commandLine.getOptionValue("m")); - return config; - } - - } - - private class ParseTask implements Callable<Integer> { - private final Path target; - - public ParseTask(Path target) { - this.target = target; - } - - /** - * @return 1 if success - * @throws Exception - */ - @Override - public Integer call() throws Exception { - boolean success = false; - try (InputStream is = Files.newInputStream(target)) { - LOG.debug("parsing " + target); - parser.parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); - success = true; - } catch (TikaException e) { - if (e.getCause() instanceof RuntimeException) { - //handleThrowable(target, e.getCause()); - success = true; - } else { - success = true; - } - } catch (SAXException | IOException e) { - success = true; - } catch (Throwable t) { - handleThrowable(target, t); - } finally { - if (success) { - try { - Files.delete(target); - } catch (IOException e) { - LOG.warn("couldn't delete: " + target.toAbsolutePath()); - } - } else { - LOG.info("FOUND PROBLEM: " + target); - } - } - return success ? 1 : 0; - } - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java deleted file mode 100644 index baa0e14a2..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.cli; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Locale; -import java.util.UUID; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.FilenameUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.fuzzing.general.ByteDeleter; -import org.apache.tika.fuzzing.general.ByteFlipper; -import org.apache.tika.fuzzing.general.ByteInjector; -import org.apache.tika.fuzzing.general.GeneralTransformer; -import org.apache.tika.fuzzing.general.SpanSwapper; -import org.apache.tika.fuzzing.general.Truncator; -import org.apache.tika.pipes.core.FetchEmitTuple; -import org.apache.tika.pipes.core.PipesConfig; -import org.apache.tika.pipes.core.PipesParser; -import org.apache.tika.pipes.core.PipesResult; -import org.apache.tika.pipes.core.emitter.EmitKey; -import org.apache.tika.pipes.core.fetcher.FetchKey; -import org.apache.tika.pipes.core.fetcher.FetcherManager; -import org.apache.tika.pipes.core.pipesiterator.PipesIterator; - -public class FuzzingCLI { - - private static final Logger LOG = LoggerFactory.getLogger(FuzzingCLI.class); - private static final String TEMP_FETCHER_NAME = "temp"; - private static final String TEMP_EMITTER_NAME = "temp"; - - public static void main(String[] args) throws Exception { - FuzzingCLIConfig config = FuzzingCLIConfig.parse(args); - if (config.getMaxTransformers() == 0) { - LOG.warn("max transformers == 0!"); - } - - FuzzingCLI fuzzingCLI = new FuzzingCLI(); - Files.createDirectories(config.getProblemsDirectory()); - fuzzingCLI.execute(config); - } - - - private void execute(FuzzingCLIConfig config) throws Exception { - ArrayBlockingQueue<FetchEmitTuple> q = new ArrayBlockingQueue(10000); - - PipesConfig pipesConfig = PipesConfig.load(config.getTikaConfig()); - FetcherManager fetcherManager = FetcherManager.load(config.getTikaConfig()); - - int totalThreads = pipesConfig.getNumClients() + 1; - - ExecutorService executorService = Executors.newFixedThreadPool(totalThreads); - ExecutorCompletionService executorCompletionService = - new ExecutorCompletionService(executorService); - PipesIterator pipesIterator = PipesIterator.build(config.getTikaConfig()); - - FileAdder fileAdder = new FileAdder(pipesIterator, q); - executorCompletionService.submit(fileAdder); - try (PipesParser parser = new PipesParser(pipesConfig)) { - - for (int i = 0; i < pipesConfig.getNumClients(); i++) { - executorCompletionService.submit(new Fuzzer(q, config, parser, fetcherManager)); - } - int finished = 0; - while (finished < totalThreads) { - Future<Integer> future = null; - try { - future = executorCompletionService.poll(1, TimeUnit.SECONDS); - if (future != null) { - future.get(); - finished++; - } - LOG.info("Finished thread {} threads of {}", finished, totalThreads); - } catch (InterruptedException | ExecutionException e) { - e.printStackTrace(); - break; - } - } - executorService.shutdown(); - executorService.shutdownNow(); - } - - } - - private static class Fuzzer implements Callable<Integer> { - static AtomicInteger COUNTER = new AtomicInteger(); - static AtomicInteger FUZZED = new AtomicInteger(); - static AtomicInteger SOURCE_FILES = new AtomicInteger(); - private final int threadId = COUNTER.getAndIncrement(); - private final ArrayBlockingQueue<FetchEmitTuple> q; - private final FuzzingCLIConfig config; - - private final PipesParser pipesParser; - - private final Transformer transformer; - - private final FetcherManager fetcherManager; - - public Fuzzer(ArrayBlockingQueue<FetchEmitTuple> q, FuzzingCLIConfig config, - PipesParser pipesParser, FetcherManager fetcherManager) { - this.q = q; - this.config = config; - this.pipesParser = pipesParser; - //TODO - parameterize this - this.transformer = - new GeneralTransformer(config.getMaxTransformers(), new ByteDeleter(), - new ByteFlipper(), new ByteInjector(), new Truncator(), - new SpanSwapper()); - this.fetcherManager = fetcherManager; - } - - @Override - public Integer call() throws Exception { - while (true) { - FetchEmitTuple fetchEmitTuple = q.take(); - if (fetchEmitTuple.equals(PipesIterator.COMPLETED_SEMAPHORE)) { - LOG.debug("Thread " + threadId + " stopping"); - q.put(PipesIterator.COMPLETED_SEMAPHORE); - return 1; - } - int inputFiles = SOURCE_FILES.getAndIncrement(); - if (inputFiles % 100 == 0) { - LOG.info("Processed {} source files", inputFiles); - } - for (int i = 0; i < config.perFileIterations; i++) { - try { - fuzzIt(fetchEmitTuple); - } catch (InterruptedException e) { - throw e; - } catch (Exception e) { - LOG.warn("serious problem with", e); - } - } - } - } - - private void fuzzIt(FetchEmitTuple fetchEmitTuple) - throws IOException, InterruptedException, TikaException { - Path cwd = Files.createTempDirectory("tika-fuzz-"); - try { - Path fuzzedPath = fuzz(fetchEmitTuple, cwd); - Path extract = Files.createTempFile(cwd, "tika-extract-", ".json"); - FetchEmitTuple fuzzedTuple = new FetchEmitTuple(fetchEmitTuple.getId(), - new FetchKey(TEMP_FETCHER_NAME, fuzzedPath.toAbsolutePath().toString()), - new EmitKey(TEMP_EMITTER_NAME, extract.toAbsolutePath().toString())); - int count = FUZZED.getAndIncrement(); - if (count % 100 == 0) { - LOG.info("processed {} fuzzed files", count); - } - boolean tryAgain = true; - int tries = 0; - while (tryAgain && tries < config.getRetries()) { - tries++; - try { - PipesResult result = pipesParser.parse(fuzzedTuple); - tryAgain = handleResult(result.getStatus(), - fetchEmitTuple.getFetchKey().getFetchKey(), fuzzedPath, tries, - config.getRetries()); - } catch (InterruptedException e) { - throw e; - } catch (Exception e) { - tryAgain = handleResult(PipesResult.STATUS.UNSPECIFIED_CRASH, - fetchEmitTuple.getFetchKey().getFetchKey(), fuzzedPath, tries, - config.getRetries()); - } - } - } finally { - try { - FileUtils.deleteDirectory(cwd.toFile()); - } catch (IOException e) { - e.printStackTrace(); - LOG.warn("Couldn't delete " + cwd.toAbsolutePath(), e); - } - } - } - - private Path fuzz(FetchEmitTuple fetchEmitTuple, Path cwd) - throws IOException, TikaException { - Path target = Files.createTempFile(cwd, "tika-fuzz-target-", - "." + FilenameUtils.getExtension(fetchEmitTuple.getFetchKey().getFetchKey())); - try (InputStream is = fetcherManager.getFetcher( - fetchEmitTuple.getFetchKey().getFetcherName()) - .fetch(fetchEmitTuple.getFetchKey().getFetchKey(), fetchEmitTuple.getMetadata(), - fetchEmitTuple.getParseContext())) { - try (OutputStream os = Files.newOutputStream(target)) { - transformer.transform(is, os); - } - } - return target; - } - - private boolean handleResult(PipesResult.STATUS status, String origFetchKey, - Path fuzzedPath, int tries, int maxRetries) - throws IOException { - switch (status) { - case OOM: - case TIMEOUT: - case UNSPECIFIED_CRASH: - if (tries < maxRetries) { - LOG.info("trying again ({} of {}): {}", tries, maxRetries, - status.name()); - return true; - } - Path problemFilePath = getProblemFile(status, origFetchKey); - LOG.info("found a problem {} -> {} : {}", origFetchKey, problemFilePath, - status.name()); - Files.copy(fuzzedPath, problemFilePath); - return false; - default: - //if there wasn't a problem - return false; - } - } - - private Path getProblemFile(PipesResult.STATUS status, String origFetchKey) - throws IOException { - String name = FilenameUtils.getName(origFetchKey) + "-" + UUID.randomUUID(); - Path problemFile = - config.getProblemsDirectory().resolve(status.name().toLowerCase(Locale.US)) - .resolve(name); - Files.createDirectories(problemFile.getParent()); - return problemFile; - } - - } - - private static class FileAdder implements Callable<Integer> { - private final PipesIterator pipesIterator; - private final ArrayBlockingQueue<FetchEmitTuple> queue; - private int added = 0; - - public FileAdder(PipesIterator pipesIterator, ArrayBlockingQueue<FetchEmitTuple> queue) { - this.pipesIterator = pipesIterator; - this.queue = queue; - } - - @Override - public Integer call() throws Exception { - int added = 0; - for (FetchEmitTuple tuple : pipesIterator) { - //hang forever -- should offer and timeout - queue.put(tuple); - added++; - } - queue.put(PipesIterator.COMPLETED_SEMAPHORE); - LOG.info("file adder finished " + added); - return 1; - } - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java deleted file mode 100644 index f06688cf5..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.cli; - -import java.nio.file.Path; -import java.nio.file.Paths; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.CommandLineParser; -import org.apache.commons.cli.DefaultParser; -import org.apache.commons.cli.Option; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; - -public class FuzzingCLIConfig { - - private static final int DEFAULT_NUM_ITERATIONS = 100; - - //allow all transformers to operate - private static final int DEFAULT_MAX_TRANSFORMERS = 1; - - private static final int DEFAULT_RETRIES = 1; - - static Options OPTIONS; - - static { - Option problems = new Option("o", "output", true, "directory for problems files"); - problems.setRequired(true); - - - OPTIONS = new Options().addOption(problems) - .addOption(Option.builder("c").longOpt("config").hasArg(true) - .desc("tika config " + - "file with " + - "specs for pipes parser, pipes iterator, fetchers and emitters") - .required(true).get()) - .addOption(Option.builder("p").longOpt("perFile") - .desc("number of iterations to run per seed file").hasArg(true).required(false) - .get()) - .addOption(Option.builder("t").longOpt("maxTransformers") - .desc("maximum number of transformers to run per iteration").hasArg(true) - .required(false).get()) - .addOption(Option.builder("r").longOpt("retries") - .desc("number of times to retry a seed file if there's a catastrophic failure") - .hasArg(true).required(false).get()); - - } - //number of variants tried per file - int perFileIterations = DEFAULT_NUM_ITERATIONS; - //maxTransformers per file - int maxTransformers = DEFAULT_MAX_TRANSFORMERS; - //max time allowed to process each file in milliseconds - long timeoutMS; - //times to retry a seed file after a catastrophic failure - int retries = DEFAULT_RETRIES; - - Path tikaConfig; - - Path problemsDir; - - public static FuzzingCLIConfig parse(String[] args) throws ParseException { - CommandLineParser parser = new DefaultParser(); - CommandLine commandLine = parser.parse(OPTIONS, args); - FuzzingCLIConfig config = new FuzzingCLIConfig(); - config.tikaConfig = Paths.get(commandLine.getOptionValue("c")); - config.problemsDir = Paths.get(commandLine.getOptionValue("o")); - config.retries = - (commandLine.hasOption("r")) ? Integer.parseInt(commandLine.getOptionValue("r")) : - DEFAULT_RETRIES; - config.maxTransformers = (commandLine.hasOption("t")) ? - Integer.parseInt(commandLine.getOptionValue("t")) : DEFAULT_MAX_TRANSFORMERS; - return config; - } - - public Path getProblemsDirectory() { - return problemsDir; - } - - public Path getTikaConfig() { - return tikaConfig; - } - - public int getMaxTransformers() { - return maxTransformers; - } - - public int getPerFileIterations() { - return perFileIterations; - } - - public int getRetries() { - return retries; - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java deleted file mode 100644 index 354082282..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.exceptions; - -import org.apache.tika.exception.TikaException; - -public class CantFuzzException extends TikaException { - public CantFuzzException(String msg) { - super(msg); - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java deleted file mode 100644 index 43ba46bf0..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.general; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Collections; -import java.util.Random; -import java.util.Set; - -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.mime.MediaType; - -public class ByteDeleter implements Transformer { - static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); - Random random = new Random(); - float percentDeleted = 0.01f; - - @Override - public Set<MediaType> getSupportedTypes() { - return SUPPORTED_TYPES; - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException { - int c = is.read(); - while (c != -1) { - if (random.nextFloat() >= percentDeleted) { - os.write(c); - } else { - //skip - } - c = is.read(); - } - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java deleted file mode 100644 index b830c7a1f..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.general; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Collections; -import java.util.Random; -import java.util.Set; - -import org.apache.commons.io.IOUtils; - -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.mime.MediaType; - -public class ByteFlipper implements Transformer { - - static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); - //TODO add something about protecting first x bytes? - private final Random random = new Random(); - private float percentCorrupt = 0.01f; - - @Override - public Set<MediaType> getSupportedTypes() { - return SUPPORTED_TYPES; - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException { - //TODO -- don't load the full thing into memory - byte[] input = IOUtils.toByteArray(is); - if (input.length == 0) { - return; - } - byte[] singleByte = new byte[1]; - //make sure that there's at least one change, even in short files - int atLeastOneIndex = random.nextInt(input.length); - - for (int i = 0; i < input.length; i++) { - if (random.nextFloat() <= percentCorrupt || i == atLeastOneIndex) { - random.nextBytes(singleByte); - os.write(singleByte[0]); - } else { - os.write(input[i]); - } - } - } - - public void setPercentCorrupt(float percentCorrupt) { - this.percentCorrupt = percentCorrupt; - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java deleted file mode 100644 index b6a5cd061..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.general; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Arrays; -import java.util.Collections; -import java.util.Random; -import java.util.Set; - -import org.apache.commons.io.IOUtils; - -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.mime.MediaType; - -public class ByteInjector implements Transformer { - static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); - Random random = new Random(); - float injectionFrequency = 0.01f; - int maxSpan = 100; - - @Override - public Set<MediaType> getSupportedTypes() { - return SUPPORTED_TYPES; - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException { - //TODO -- don't load the full thing into memory - byte[] input = IOUtils.toByteArray(is); - int numInjections = (int) Math.floor((double) injectionFrequency * (double) input.length); - //at least one injection - numInjections = numInjections == 0 ? 1 : numInjections; - int[] starts = new int[numInjections]; - if (numInjections > 1) { - for (int i = 0; i < numInjections; i++) { - starts[i] = random.nextInt(input.length - 1); - } - } else { - starts[0] = 0; - } - Arrays.sort(starts); - int startIndex = 0; - - for (int i = 0; i < input.length; i++) { - os.write(input[i]); - if (startIndex < starts.length && starts[startIndex] == i) { - inject(os); - startIndex++; - } - } - } - - private void inject(OutputStream os) throws IOException { - int len = random.nextInt(maxSpan); - byte[] randBytes = new byte[len]; - random.nextBytes(randBytes); - os.write(randBytes); - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java deleted file mode 100644 index 20ca55ff3..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.general; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Collections; -import java.util.HashSet; -import java.util.Random; -import java.util.Set; - -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.mime.MediaType; - -public class GeneralTransformer implements Transformer { - - private static final Logger LOG = LoggerFactory.getLogger(GeneralTransformer.class); - private final int maxTransforms; - private final Transformer[] transformers; - private final Set<MediaType> supportedTypes; - Random random = new Random(); - - public GeneralTransformer() { - this(new ByteDeleter(), new ByteFlipper(), new ByteInjector(), new Truncator(), - new SpanSwapper()); - } - - public GeneralTransformer(Transformer... transformers) { - this(transformers.length, transformers); - } - - public GeneralTransformer(int maxTransforms, Transformer... transformers) { - this.maxTransforms = (maxTransforms < 0) ? transformers.length : maxTransforms; - this.transformers = transformers; - Set<MediaType> tmpTypes = new HashSet<>(); - for (Transformer transformer : transformers) { - tmpTypes.addAll(transformer.getSupportedTypes()); - } - supportedTypes = Collections.unmodifiableSet(tmpTypes); - } - - @Override - public Set<MediaType> getSupportedTypes() { - return supportedTypes; - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException, TikaException { - //used for debugging - if (maxTransforms == 0) { - IOUtils.copy(is, os); - return; - } - int transformerCount = (maxTransforms == 1) ? 1 : 1 + random.nextInt(maxTransforms); - int[] transformerIndices = new int[transformerCount]; - for (int i = 0; i < transformerCount; i++) { - transformerIndices[i] = random.nextInt(transformers.length); - } - //TODO -- make this actually streaming - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - IOUtils.copy(is, bos); - for (int transformerIndex : transformerIndices) { - byte[] bytes = bos.toByteArray(); - bos = new ByteArrayOutputStream(); - transformers[transformerIndex].transform(new ByteArrayInputStream(bytes), bos); - bos.flush(); - if (bos.toByteArray().length == 0) { - LOG.warn("zero length: " + transformers[transformerIndex]); - } - } - os.write(bos.toByteArray()); - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java deleted file mode 100644 index a15a750e7..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.general; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Collections; -import java.util.Random; -import java.util.Set; - -import org.apache.commons.io.IOUtils; - -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.mime.MediaType; - -/** - * randomly swaps spans from the input - */ -public class SpanSwapper implements Transformer { - - static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); - Random random = new Random(); - int maxSpanLength = 10000; - private final float swapProbability = 0.01f; - - @Override - public Set<MediaType> getSupportedTypes() { - return SUPPORTED_TYPES; - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException { - byte[] input = IOUtils.toByteArray(is); - int numSwaps = (int) Math.floor(swapProbability * input.length); - //at least one swap - numSwaps = numSwaps == 0 ? 1 : numSwaps; - byte[] ret = new byte[input.length]; - System.arraycopy(input, 0, ret, 0, input.length); - for (int i = 0; i < numSwaps; i++) { - ret = swap(ret); - } - os.write(ret); - } - - private byte[] swap(byte[] ret) { - if (ret.length == 0) { - return new byte[0]; - } - int srcStart = random.nextInt(ret.length); - int targStart = random.nextInt(ret.length); - //these spans can overlap; - - int len = random.nextInt(maxSpanLength); - int maxStart = Math.max(srcStart, targStart); - len = (len + maxStart < ret.length) ? len : ret.length - maxStart; - - byte[] landingBytes = new byte[len]; - //copy the landing zone - System.arraycopy(ret, targStart, landingBytes, 0, len); - //now copy the src onto the targ - System.arraycopy(ret, srcStart, ret, targStart, len); - //now copy the targ over to the src - System.arraycopy(landingBytes, 0, ret, srcStart, len); - return ret; - } - -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java deleted file mode 100644 index bf5583646..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.general; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Collections; -import java.util.Random; -import java.util.Set; - -import org.apache.commons.io.IOUtils; - -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.mime.MediaType; - -public class Truncator implements Transformer { - - static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); - Random random = new Random(); - - @Override - public Set<MediaType> getSupportedTypes() { - return SUPPORTED_TYPES; - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException { - //TODO -- redo streaming - byte[] input = IOUtils.toByteArray(is); - if (input.length == 0) { - return; - } - int len = 1 + random.nextInt(input.length); - //at least one - if (len >= input.length) { - len = input.length - 2; - if (len < 0) { - len = 0; - } - } - - byte[] ret = new byte[len]; - System.arraycopy(input, 0, ret, 0, len); - os.write(ret); - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java deleted file mode 100644 index 4e88f14a0..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java +++ /dev/null @@ -1,1486 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.pdf; - -import java.io.BufferedOutputStream; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.Closeable; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.SequenceInputStream; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.text.DecimalFormat; -import java.text.DecimalFormatSymbols; -import java.text.NumberFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Deque; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Random; -import java.util.Set; - -import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; -import org.apache.pdfbox.cos.COSArray; -import org.apache.pdfbox.cos.COSBase; -import org.apache.pdfbox.cos.COSBoolean; -import org.apache.pdfbox.cos.COSDictionary; -import org.apache.pdfbox.cos.COSDocument; -import org.apache.pdfbox.cos.COSFloat; -import org.apache.pdfbox.cos.COSInteger; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.cos.COSNull; -import org.apache.pdfbox.cos.COSNumber; -import org.apache.pdfbox.cos.COSObject; -import org.apache.pdfbox.cos.COSObjectKey; -import org.apache.pdfbox.cos.COSStream; -import org.apache.pdfbox.cos.COSString; -import org.apache.pdfbox.cos.COSUpdateInfo; -import org.apache.pdfbox.cos.ICOSVisitor; -import org.apache.pdfbox.filter.Filter; -import org.apache.pdfbox.filter.FilterFactory; -import org.apache.pdfbox.io.IOUtils; -import org.apache.pdfbox.io.RandomAccessInputStream; -import org.apache.pdfbox.io.RandomAccessRead; -import org.apache.pdfbox.pdfparser.PDFXRefStream; -import org.apache.pdfbox.pdfparser.xref.FreeXReference; -import org.apache.pdfbox.pdfparser.xref.NormalXReference; -import org.apache.pdfbox.pdfparser.xref.XReferenceEntry; -import org.apache.pdfbox.pdfwriter.COSStandardOutputStream; -import org.apache.pdfbox.pdfwriter.COSWriter; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.common.PDStream; -import org.apache.pdfbox.pdmodel.encryption.SecurityHandler; -import org.apache.pdfbox.pdmodel.fdf.FDFDocument; -import org.apache.pdfbox.pdmodel.interactive.digitalsignature.COSFilterInputStream; -import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface; -import org.apache.pdfbox.util.Hex; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; - -//TODO PDFBOX30 replace COSWriterXRefEntry with XReferenceEntry (and much more) - -public class EvilCOSWriter implements ICOSVisitor, Closeable { - - /** - * The dictionary open token. - */ - public static final byte[] DICT_OPEN = "<<".getBytes(StandardCharsets.US_ASCII); - /** - * The dictionary close token. - */ - public static final byte[] DICT_CLOSE = ">>".getBytes(StandardCharsets.US_ASCII); - /** - * space character. - */ - public static final byte[] SPACE = {' '}; - /** - * The start to a PDF comment. - */ - public static final byte[] COMMENT = {'%'}; - /** - * The output version of the PDF. - */ - public static final byte[] VERSION = "PDF-1.4".getBytes(StandardCharsets.US_ASCII); - /** - * Garbage bytes used to create the PDF header. - */ - public static final byte[] GARBAGE = - new byte[]{(byte) 0xf6, (byte) 0xe4, (byte) 0xfc, (byte) 0xdf}; - /** - * The EOF constant. - */ - public static final byte[] EOF = "%%EOF".getBytes(StandardCharsets.US_ASCII); - /** - * The reference token. - */ - public static final byte[] REFERENCE = "R".getBytes(StandardCharsets.US_ASCII); - // pdf tokens - /** - * The XREF token. - */ - public static final byte[] XREF = "xref".getBytes(StandardCharsets.US_ASCII); - /** - * The xref free token. - */ - public static final byte[] XREF_FREE = "f".getBytes(StandardCharsets.US_ASCII); - /** - * The xref used token. - */ - public static final byte[] XREF_USED = "n".getBytes(StandardCharsets.US_ASCII); - /** - * The trailer token. - */ - public static final byte[] TRAILER = "trailer".getBytes(StandardCharsets.US_ASCII); - /** - * The start xref token. - */ - public static final byte[] STARTXREF = "startxref".getBytes(StandardCharsets.US_ASCII); - /** - * The starting object token. - */ - public static final byte[] OBJ = "obj".getBytes(StandardCharsets.US_ASCII); - /** - * The end object token. - */ - public static final byte[] ENDOBJ = "endobj".getBytes(StandardCharsets.US_ASCII); - /** - * The array open token. - */ - public static final byte[] ARRAY_OPEN = "[".getBytes(StandardCharsets.US_ASCII); - /** - * The array close token. - */ - public static final byte[] ARRAY_CLOSE = "]".getBytes(StandardCharsets.US_ASCII); - /** - * The open stream token. - */ - public static final byte[] STREAM = "stream".getBytes(StandardCharsets.US_ASCII); - /** - * The close stream token. - */ - public static final byte[] ENDSTREAM = "endstream".getBytes(StandardCharsets.US_ASCII); - private static final Logger LOG = LoggerFactory.getLogger(EvilCOSWriter.class); - private final NumberFormat formatXrefOffset = - new DecimalFormat("0000000000", DecimalFormatSymbols.getInstance(Locale.US)); - - // the decimal format for the xref object generation number data - private final NumberFormat formatXrefGeneration = - new DecimalFormat("00000", DecimalFormatSymbols.getInstance(Locale.US)); - // maps the object to the keys generated in the writer - // these are used for indirect references in other objects - //A hashtable is used on purpose over a hashmap - //so that null entries will not get added. - @SuppressWarnings({"squid:S1149"}) - private final Map<COSBase, COSObjectKey> objectKeys = new Hashtable<>(); - private final Map<COSObjectKey, COSBase> keyObject = new HashMap<>(); - // the list of x ref entries to be made so far - private final List<XReferenceEntry> xRefEntries = new ArrayList<>(); - private final Set<COSBase> objectsToWriteSet = new HashSet<>(); - //A list of objects to write. - private final Deque<COSBase> objectsToWrite = new LinkedList<>(); - //a list of objects already written - private final Set<COSBase> writtenObjects = new HashSet<>(); - //An 'actual' is any COSBase that is not a COSObject. - //need to keep a list of the actuals that are added - //as well as the objects because there is a problem - //when adding a COSObject and then later adding - //the actual for that object, so we will track - //actuals separately. - private final Set<COSBase> actualsAdded = new HashSet<>(); - private final PDFTransformerConfig config; - private final Random random = new Random(); - // the stream where we create the pdf output - private OutputStream output; - // the stream used to write standard cos data - private COSStandardOutputStream standardOutput; - // the start position of the x ref section - private long startxref = 0; - // the current object number - private long number = 0; - private int roughNumberOfObjects = 0; - private COSObjectKey currentObjectKey = null; - private PDDocument pdDocument = null; - private FDFDocument fdfDocument = null; - private boolean willEncrypt = false; - // signing - private final boolean incrementalUpdate = false; - private boolean reachedSignature = false; - private long signatureOffset; - private long signatureLength; - private long byteRangeOffset; - private long byteRangeLength; - private RandomAccessRead incrementalInput; - private OutputStream incrementalOutput; - private SignatureInterface signatureInterface; - private byte[] incrementPart; - private COSArray byteRangeArray; - private final FilterFactory filterFactory = FilterFactory.INSTANCE; - - /** - * COSWriter constructor. - * - * @param outputStream The output stream to write the PDF. It will be closed when this object is - * closed. - */ - public EvilCOSWriter(OutputStream outputStream, PDFTransformerConfig config) { - setOutput(outputStream); - setStandardOutput(new COSStandardOutputStream(output)); - this.config = config; - } - - /** - * This will output the given byte getString as a PDF object. - * - * @param string COSString to be written - * @param output The stream to write to. - * @throws IOException If there is an error writing to the stream. - */ - public static void writeString(COSString string, OutputStream output) throws IOException { - writeString(string.getBytes(), string.getForceHexForm(), output); - } - - /** - * This will output the given text/byte getString as a PDF object. - * - * @param bytes byte array representation of a string to be written - * @param output The stream to write to. - * @throws IOException If there is an error writing to the stream. - */ - public static void writeString(byte[] bytes, OutputStream output) throws IOException { - writeString(bytes, false, output); - } - - /** - * This will output the given text/byte string as a PDF object. - * - * @param output The stream to write to. - * @throws IOException If there is an error writing to the stream. - */ - private static void writeString(byte[] bytes, boolean forceHex, OutputStream output) - throws IOException { - // check for non-ASCII characters - boolean isASCII = true; - if (!forceHex) { - for (byte b : bytes) { - // if the byte is negative then it is an eight bit byte and is outside the ASCII range - if (b < 0) { - isASCII = false; - break; - } - // PDFBOX-3107 EOL markers within a string are troublesome - if (b == 0x0d || b == 0x0a) { - isASCII = false; - break; - } - } - } - - if (isASCII && !forceHex) { - // write ASCII string - output.write('('); - for (byte b : bytes) { - switch (b) { - case '(': - case ')': - case '\\': - output.write('\\'); - output.write(b); - break; - default: - output.write(b); - break; - } - } - output.write(')'); - } else { - // write hex string - output.write('<'); - Hex.writeHexBytes(bytes, output); - output.write('>'); - } - } - - private void prepareIncrement(PDDocument doc) throws IOException { - if (doc != null) { - COSDocument cosDoc = doc.getDocument(); - - Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable(); - Set<COSObjectKey> keySet = xrefTable.keySet(); - long highestNumber = doc.getDocument().getHighestXRefObjectNumber(); - for (COSObjectKey cosObjectKey : keySet) { - COSBase object = cosDoc.getObjectFromPool(cosObjectKey).getObject(); - if (object != null && cosObjectKey != null && !(object instanceof COSNumber)) { - objectKeys.put(object, cosObjectKey); - keyObject.put(cosObjectKey, object); - } - - if (cosObjectKey != null) { - long num = cosObjectKey.getNumber(); - if (num > highestNumber) { - highestNumber = num; - } - } - } - setNumber(highestNumber); - } - } - - /** - * add an entry in the x ref table for later dump. - * - * @param entry The new entry to add. - */ - protected void addXRefEntry(XReferenceEntry entry) { - getXRefEntries().add(entry); - } - - /** - * This will close the stream. - * - * @throws IOException If the underlying stream throws an exception. - */ - @Override - public void close() throws IOException { - if (getStandardOutput() != null) { - getStandardOutput().close(); - } - if (incrementalOutput != null) { - incrementalOutput.close(); - } - } - - /** - * This will get the current object number. - * - * @return The current object number. - */ - protected long getNumber() { - return number; - } - - /** - * This will set the current object number. - * - * @param newNumber The new object number. - */ - protected void setNumber(long newNumber) { - number = newNumber; - - } - - /** - * This will get all available object keys. - * - * @return A map of all object keys. - */ - public Map<COSBase, COSObjectKey> getObjectKeys() { - return objectKeys; - } - - /** - * This will get the output stream. - * - * @return The output stream. - */ - protected java.io.OutputStream getOutput() { - return output; - } - - /** - * This will set the output stream. - * - * @param newOutput The new output stream. - */ - private void setOutput(OutputStream newOutput) { - output = newOutput; - } - - /** - * This will get the standard output stream. - * - * @return The standard output stream. - */ - protected COSStandardOutputStream getStandardOutput() { - return standardOutput; - } - - /** - * This will set the standard output stream. - * - * @param newStandardOutput The new standard output stream. - */ - private void setStandardOutput(COSStandardOutputStream newStandardOutput) { - standardOutput = newStandardOutput; - } - - /** - * This will get the current start xref. - * - * @return The current start xref. - */ - protected long getStartxref() { - return startxref; - } - - /** - * This will set the start xref. - * - * @param newStartxref The new start xref attribute. - */ - protected void setStartxref(long newStartxref) { - startxref = newStartxref; - } - - /** - * This will get the xref entries. - * - * @return All available xref entries. - */ - protected List<XReferenceEntry> getXRefEntries() { - return xRefEntries; - } - - /** - * This will write the body of the document. - * - * @param doc The document to write the body for. - * @throws IOException If there is an error writing the data. - */ - protected void doWriteBody(COSDocument doc) throws IOException { - COSDictionary trailer = doc.getTrailer(); - COSDictionary root = trailer.getCOSDictionary(COSName.ROOT); - COSDictionary info = trailer.getCOSDictionary(COSName.INFO); - COSDictionary encrypt = trailer.getCOSDictionary(COSName.ENCRYPT); - roughNumberOfObjects = doc.getXrefTable().size(); - if (root != null) { - addObjectToWrite(root); - } - if (info != null) { - addObjectToWrite(info); - } - - doWriteObjects(); - willEncrypt = false; - if (encrypt != null) { - addObjectToWrite(encrypt); - } - - doWriteObjects(); - } - - private void doWriteObjects() throws IOException { - while (objectsToWrite.size() > 0) { - COSBase nextObject = objectsToWrite.removeFirst(); - objectsToWriteSet.remove(nextObject); - doWriteObject(nextObject); - } - } - - private void addObjectToWrite(COSBase object) { - COSBase actual = object; - if (actual instanceof COSObject) { - actual = ((COSObject) actual).getObject(); - } - - if (!writtenObjects.contains(object) && !objectsToWriteSet.contains(object) && - !actualsAdded.contains(actual)) { - COSBase cosBase = null; - COSObjectKey cosObjectKey = null; - if (actual != null) { - cosObjectKey = objectKeys.get(actual); - } - if (cosObjectKey != null) { - cosBase = keyObject.get(cosObjectKey); - } - if (actual != null && objectKeys.containsKey(actual) && - object instanceof COSUpdateInfo && - !((COSUpdateInfo) object).isNeedToBeUpdated() && - cosBase instanceof COSUpdateInfo && - !((COSUpdateInfo) cosBase).isNeedToBeUpdated()) { - return; - } - objectsToWrite.add(object); - objectsToWriteSet.add(object); - if (actual != null) { - actualsAdded.add(actual); - } - } - } - - public void doWriteObject( COSBase obj ) throws IOException { - writtenObjects.add( obj ); - // find the physical reference - currentObjectKey = getObjectKey( obj ); - doWriteObject(currentObjectKey, obj); - } - - public void doWriteObject(COSObjectKey key, COSBase obj) throws IOException - { - // don't write missing objects to avoid broken xref tables - if (obj == null || (obj instanceof COSObject && ((COSObject) obj).getObject() == null)) - { - return; - } - writtenObjects.add(obj); - // find the physical reference - currentObjectKey = getObjectKey(obj); - - // add a x ref entry - addXRefEntry(new NormalXReference(getStandardOutput().getPos(), key, obj)); - long objectNumber = currentObjectKey.getNumber(); - if (config.getRandomizeObjectNumbers() > 0.0f && - random.nextFloat() < config.getRandomizeObjectNumbers()) { - objectNumber = random.nextInt(((int) objectNumber) * 2); - } - // write the object - getStandardOutput() - .write(Long.toString(objectNumber).getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().write(SPACE); - getStandardOutput() - .write(String.valueOf(key.getGeneration()).getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().write(SPACE); - getStandardOutput().write(OBJ); - getStandardOutput().writeEOL(); - mutate(obj); - if (obj != null) { - writeObjContents(obj); - } - getStandardOutput().writeEOL(); - getStandardOutput().write(ENDOBJ); - getStandardOutput().writeEOL(); - } - - private void writeObjContents(COSBase obj) throws IOException { - if (!(obj instanceof COSObject)) { - obj.accept(this); - return; - } - - COSObject cosObject = (COSObject) obj; - COSBase underlyingObject = cosObject.getObject(); - if (underlyingObject instanceof COSStream && - config.getUnfilteredStreamTransformer() != null) { - COSStream cosStream = (COSStream) underlyingObject; - Transformer unfilteredStreamTransformer = config.getUnfilteredStreamTransformer(); - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - try (InputStream is = cosStream.createRawInputStream()) { - IOUtils.copy(is, bos); - } - ByteArrayOutputStream transformed = new ByteArrayOutputStream(); - try { - unfilteredStreamTransformer.transform(new ByteArrayInputStream(bos.toByteArray()), - transformed); - } catch (TikaException e) { - throw new IOException(e); - } - try (OutputStream os = cosStream.createRawOutputStream()) { - IOUtils.copy(new ByteArrayInputStream(transformed.toByteArray()), os); - } - //stream automatically sets the length correctly - obj.accept(this); - } else { - obj.accept(this); - } - } - - private void mutate(COSBase obj) throws IOException { - - //stub - if (obj instanceof COSStream) { - COSStream stream = (COSStream) obj; - //get the raw unfiltered bytes - byte[] bytes = new PDStream(stream).toByteArray(); - //transform the underlying stream _before_ filters are applied - if (config.getStreamTransformer() != null) { - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - try { - config.getStreamTransformer().transform(new ByteArrayInputStream(bytes), bos); - } catch (TikaException e) { - throw new IOException(e); - } - bytes = bos.toByteArray(); - } - COSBase filters = getFilters(stream.getFilters()); - if (filters instanceof COSNull) { - stream.removeItem(COSName.FILTER); - } else { - List<COSName> usedFilters = new ArrayList<>(); - long length = -1; - try (TikaInputStream rawBytes = TikaInputStream.get(bytes)) { - try (TikaInputStream filtered = runFilters(filters, rawBytes, usedFilters)) { - //rewrite raw bytes after running own filters - try (OutputStream streamOut = stream.createRawOutputStream()) { - IOUtils.copy(filtered, streamOut); - } - length = filtered.getLength(); - } - } - Collections.reverse(usedFilters); - COSArray actualFilters = new COSArray(); - for (COSName f : usedFilters) { - actualFilters.add(f); - } - //TODO: parameterize wonkifying length and filters - stream.setLong(COSName.LENGTH, length); - stream.setItem(COSName.FILTER, actualFilters); - } - } else if (obj instanceof COSObject) { - COSBase underlyingObject = ((COSObject) obj).getObject(); - mutate(underlyingObject); - - } - } - - private TikaInputStream runFilters(COSBase filters, TikaInputStream is, - List<COSName> usedFilters) throws IOException { - if (filters instanceof COSNull) { - } else if (filters instanceof COSName) { - is = runFilter((COSName) filters, is, new COSDictionary(), 0); - usedFilters.add((COSName) filters); - LOG.debug("filter:" + filters + " " + 0 + " : " + is.getLength()); - } else if (filters instanceof COSArray) { - COSArray filterArray = (COSArray) filters; - //need to apply them in reverse order! - boolean transformed = false; - for (int i = filterArray.size() - 1; i >= 0; i--) { - COSName filter = (COSName) filterArray.get(i); - is = runFilter(filter, is, new COSDictionary(), 0); - if (random.nextFloat() > 0.1 && transformed == false) { - is = transformRawStream(is); - transformed = true; - } - usedFilters.add(filter); - LOG.debug("filter:" + filter.toString() + " " + i + " : " + is.getLength()); - if (is.getLength() > config.getMaxFilteredStreamLength()) { - LOG.debug("stopping early"); - return is; - } - } - return is; - } else { - throw new IllegalArgumentException( - "Can't handle this class here: " + filters.getClass()); - } - return transformRawStream(is); - } - - private TikaInputStream transformRawStream(TikaInputStream is) throws IOException { - if (config.getUnfilteredStreamTransformer() != null) { - if (is.getLength() < 10000000) { - try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) { - config.getUnfilteredStreamTransformer().transform(is, bos); - bos.flush(); - bos.close(); - return TikaInputStream.get(bos.toByteArray()); - } catch (TikaException e) { - throw new IOException(e); - } - } else { - TemporaryResources tmp = new TemporaryResources(); - Path p = tmp.createTempFile(); - try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(p))) { - config.getUnfilteredStreamTransformer().transform(is, os); - os.flush(); - } catch (TikaException e) { - throw new IOException(e); - } - return TikaInputStream.get(p, new Metadata(), tmp); - } - } - return is; - } - - private TikaInputStream runFilter(COSName filterCOSName, TikaInputStream tis, - COSDictionary filterParameters, int filterIndex) - throws IOException { - - Filter filter = filterFactory.getFilter(filterCOSName); - if (tis.getLength() < 100000000) { - try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) { - filter.encode(tis, bos, filterParameters, filterIndex); - bos.flush(); - bos.close(); - return TikaInputStream.get(bos.toByteArray()); - } finally { - tis.close(); - } - } else { - TemporaryResources tmp = new TemporaryResources(); - Path p = tmp.createTempFile(); - try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(p))) { - filter.encode(tis, os, filterParameters, filterIndex); - } finally { - tis.close(); - } - return TikaInputStream.get(p, new Metadata(), tmp); - } - } - - private COSBase getFilters(COSBase existingFilters) { - List<COSName> filters = config.getFilters(existingFilters); - if (filters.size() == 0) { - return COSNull.NULL; - } else if (filters.size() == 1) { - return filters.get(0); - } else { - COSArray arr = new COSArray(); - for (COSName n : filters) { - arr.add(n); - } - return arr; - } - } - - /** - * This will write the header to the PDF document. - * - * @param doc The document to get the data from. - * @throws IOException If there is an error writing to the stream. - */ - protected void doWriteHeader(COSDocument doc) throws IOException { - String headerString; - if (fdfDocument != null) { - headerString = "%FDF-" + doc.getVersion(); - } else { - headerString = "%PDF-" + doc.getVersion(); - } - getStandardOutput().write(headerString.getBytes(StandardCharsets.ISO_8859_1)); - - getStandardOutput().writeEOL(); - getStandardOutput().write(COMMENT); - getStandardOutput().write(GARBAGE); - getStandardOutput().writeEOL(); - } - - /** - * This will write the trailer to the PDF document. - * - * @param doc The document to create the trailer for. - * @throws IOException If there is an IOError while writing the document. - */ - protected void doWriteTrailer(COSDocument doc) throws IOException { - getStandardOutput().write(TRAILER); - getStandardOutput().writeEOL(); - - COSDictionary trailer = doc.getTrailer(); - //sort xref, needed only if object keys not regenerated - Collections.sort(getXRefEntries()); - XReferenceEntry lastEntry = getXRefEntries().get(getXRefEntries().size() - 1); - - trailer.setLong(COSName.SIZE, lastEntry.getReferencedKey().getNumber() + 1); - // Only need to stay, if an incremental update will be performed - if (!incrementalUpdate) { - trailer.removeItem(COSName.PREV); - } - if (!doc.isXRefStream()) { - trailer.removeItem(COSName.XREF_STM); - } - // Remove a checksum if present - trailer.removeItem(COSName.DOC_CHECKSUM); - - COSArray idArray = trailer.getCOSArray(COSName.ID); - if (idArray != null) { - idArray.setDirect(true); - } - - trailer.accept(this); - } - - private void doWriteXRefInc(COSDocument doc, long hybridPrev) throws IOException { - if (doc.isXRefStream() || hybridPrev != -1) { - // the file uses XrefStreams, so we need to update - // it with an xref stream. We create a new one and fill it - // with data available here - - // create a new XRefStrema object - PDFXRefStream pdfxRefStream = new PDFXRefStream(doc); - - // add all entries from the incremental update. - List<XReferenceEntry> xRefEntries2 = getXRefEntries(); - for (XReferenceEntry cosWriterXRefEntry : xRefEntries2) { - pdfxRefStream.addEntry(cosWriterXRefEntry); - } - - COSDictionary trailer = doc.getTrailer(); - if (incrementalUpdate) { - // use previous startXref value as new PREV value - trailer.setLong(COSName.PREV, doc.getStartXref()); - } else { - trailer.removeItem(COSName.PREV); - } - pdfxRefStream.addTrailerInfo(trailer); - // the size is the highest object number+1. we add one more - // for the xref stream object we are going to write - pdfxRefStream.setSize(getNumber() + 2); - - setStartxref(getStandardOutput().getPos()); - COSStream stream2 = pdfxRefStream.getStream(); - doWriteObject(stream2); - } - - if (!doc.isXRefStream() || hybridPrev != -1) { - COSDictionary trailer = doc.getTrailer(); - trailer.setLong(COSName.PREV, doc.getStartXref()); - if (hybridPrev != -1) { - COSName xrefStm = COSName.XREF_STM; - trailer.removeItem(xrefStm); - trailer.setLong(xrefStm, getStartxref()); - } - doWriteXRefTable(); - doWriteTrailer(doc); - } - } - - // writes the "xref" table - private void doWriteXRefTable() throws IOException { - addXRefEntry(FreeXReference.NULL_ENTRY); - - // sort xref, needed only if object keys not regenerated - Collections.sort(getXRefEntries()); - - // remember the position where x ref was written - setStartxref(getStandardOutput().getPos()); - - getStandardOutput().write(XREF); - getStandardOutput().writeEOL(); - // write start object number and object count for this x ref section - // we assume starting from scratch - - Long[] xRefRanges = getXRefRanges(getXRefEntries()); - int xRefLength = xRefRanges.length; - int x = 0; - int j = 0; - while (x < xRefLength && (xRefLength % 2) == 0) { - writeXrefRange(xRefRanges[x], xRefRanges[x + 1]); - - for (int i = 0; i < xRefRanges[x + 1]; ++i) { - writeXrefEntry(xRefEntries.get(j++)); - } - x += 2; - } - } - - /** - * Write an incremental update for a non signature case. This can be used for e.g. augmenting - * signatures. - * - * @throws IOException - */ - private void doWriteIncrement() throws IOException { - // write existing PDF - IOUtils.copy(new RandomAccessInputStream(incrementalInput), incrementalOutput); - // write the actual incremental update - incrementalOutput.write(getBytes(output)); - } - - private void doWriteSignature() throws IOException { - // calculate the ByteRange values - long inLength = incrementalInput.length(); - long beforeLength = signatureOffset; - long afterOffset = signatureOffset + signatureLength; - long afterLength = getStandardOutput().getPos() - (inLength + signatureLength) - - (signatureOffset - inLength); - - String byteRange = "0 " + beforeLength + " " + afterOffset + " " + afterLength + "]"; - - // Assign the values to the actual COSArray, so that the user can access it before closing - byteRangeArray.set(0, COSInteger.ZERO); - byteRangeArray.set(1, COSInteger.get(beforeLength)); - byteRangeArray.set(2, COSInteger.get(afterOffset)); - byteRangeArray.set(3, COSInteger.get(afterLength)); - - if (byteRange.length() > byteRangeLength) { - throw new IOException("Can't write new byteRange '" + byteRange + - "' not enough space: byteRange.length(): " + byteRange.length() + - ", byteRangeLength: " + byteRangeLength); - } - - // copy the new incremental data into a buffer (e.g. signature dict, trailer) - output.flush(); - incrementPart = getBytes(output); - - // overwrite the ByteRange in the buffer - byte[] byteRangeBytes = byteRange.getBytes(StandardCharsets.ISO_8859_1); - for (int i = 0; i < byteRangeLength; i++) { - if (i >= byteRangeBytes.length) { - incrementPart[(int) (byteRangeOffset + i - inLength)] = 0x20; // SPACE - } else { - incrementPart[(int) (byteRangeOffset + i - inLength)] = byteRangeBytes[i]; - } - } - - if (signatureInterface != null) { - // data to be signed - try (InputStream dataToSign = getDataToSign()) { - // sign the bytes - byte[] signatureBytes = signatureInterface.sign(dataToSign); - writeExternalSignature(signatureBytes); - } - } - // else signature should created externally and set via writeSignature() - } - - /** - * Return the stream of PDF data to be signed. Clients should use this method only to create - * signatures externally. {@link #write(PDDocument)} method should have been called prior. The - * created signature should be set using {@link #writeExternalSignature(byte[])}. - * <p> - * When {@link SignatureInterface} instance is used, COSWriter obtains and writes the signature - * itself. - * </p> - * - * @return data stream to be signed - * @throws IllegalStateException if PDF is not prepared for external signing - * @throws IOException if input data is closed - */ - public InputStream getDataToSign() throws IOException { - if (incrementPart == null || incrementalInput == null) { - throw new IllegalStateException("PDF not prepared for signing"); - } - // range of incremental bytes to be signed (includes /ByteRange but not /Contents) - int incPartSigOffset = (int) (signatureOffset - incrementalInput.length()); - int afterSigOffset = incPartSigOffset + (int) signatureLength; - int[] range = {0, incPartSigOffset, afterSigOffset, incrementPart.length - afterSigOffset}; - - return new SequenceInputStream(new RandomAccessInputStream(incrementalInput), - new COSFilterInputStream(incrementPart, range)); - } - - /** - * Write externally created signature of PDF data obtained via {@link #getDataToSign()} method. - * - * @param cmsSignature CMS signature byte array - * @throws IllegalStateException if PDF is not prepared for external signing - * @throws IOException if source data stream is closed - */ - public void writeExternalSignature(byte[] cmsSignature) throws IOException { - - if (incrementPart == null || incrementalInput == null) { - throw new IllegalStateException("PDF not prepared for setting signature"); - } - byte[] signatureBytes = Hex.getBytes(cmsSignature); - - // subtract 2 bytes because of the enclosing "<>" - if (signatureBytes.length > signatureLength - 2) { - throw new IOException("Can't write signature, not enough space"); - } - - // overwrite the signature Contents in the buffer - int incPartSigOffset = (int) (signatureOffset - incrementalInput.length()); - System.arraycopy(signatureBytes, 0, incrementPart, incPartSigOffset + 1, - signatureBytes.length); - - // write the data to the incremental output stream - IOUtils.copy(new RandomAccessInputStream(incrementalInput), incrementalOutput); - incrementalOutput.write(incrementPart); - - // prevent further use - incrementPart = null; - } - - private void writeXrefRange(long x, long y) throws IOException { - getStandardOutput().write(String.valueOf(x).getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().write(SPACE); - getStandardOutput().write(String.valueOf(y).getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().writeEOL(); - } - - private void writeXrefEntry(XReferenceEntry entry) throws IOException - { - String offset = formatXrefOffset.format(entry.getSecondColumnValue()); - String generation = formatXrefGeneration.format(entry.getThirdColumnValue()); - getStandardOutput().write(offset.getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().write(SPACE); - getStandardOutput().write(generation.getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().write(SPACE); - getStandardOutput().write(entry instanceof FreeXReference ? XREF_FREE : XREF_USED); - getStandardOutput().writeCRLF(); - } - - /** - * check the xref entries and write out the ranges. The format of the - * returned array is exactly the same as the pdf specification. See section - * 7.5.4 of ISO32000-1:2008, example 1 (page 40) for reference. - * <p> - * example: 0 1 2 5 6 7 8 10 - * <p> - * will create a array with follow ranges - * <p> - * 0 3 5 4 10 1 - * <p> - * this mean that the element 0 is followed by two other related numbers - * that represent a cluster of the size 3. 5 is follow by three other - * related numbers and create a cluster of size 4. etc. - * - * @param xRefEntriesList list with the xRef entries that was written - * @return a integer array with the ranges - */ - protected Long[] getXRefRanges(List<XReferenceEntry> xRefEntriesList) { - long last = -2; - long count = 1; - - List<Long> list = new ArrayList<>(); - for (XReferenceEntry object : xRefEntriesList) { - long nr = (int) object.getReferencedKey().getNumber(); - if (nr == last + 1) { - ++count; - last = nr; - } else if (last == -2) { - last = nr; - } else { - list.add(last - count + 1); - list.add(count); - last = nr; - count = 1; - } - } - // If no new entry is found, we need to write out the last result - if (xRefEntriesList.size() > 0) { - list.add(last - count + 1); - list.add(count); - } - return list.toArray(new Long[0]); - } - - /** - * This will get the object key for the object. - * - * @param obj The object to get the key for. - * @return The object key for the object. - */ - private COSObjectKey getObjectKey(COSBase obj) { - COSBase actual = obj; - if (actual instanceof COSObject) { - actual = ((COSObject) obj).getObject(); - } - // PDFBOX-4540: because objectKeys is accessible from outside, it is possible - // that a COSObject obj is already in the objectKeys map. - COSObjectKey key = objectKeys.get(obj); - if (key == null && actual != null) { - key = objectKeys.get(actual); - } - if (key == null) { - setNumber(getNumber() + 1); - key = new COSObjectKey(getNumber(), 0); - objectKeys.put(obj, key); - if (actual != null) { - objectKeys.put(actual, key); - } - } - return key; - } - - @Override - public void visitFromArray(COSArray obj) throws IOException { - int count = 0; - getStandardOutput().write(ARRAY_OPEN); - for (Iterator<COSBase> i = obj.iterator(); i.hasNext(); ) { - COSBase current = i.next(); - if (current instanceof COSDictionary) { - if (current.isDirect()) { - visitFromDictionary((COSDictionary) current); - } else { - addObjectToWrite(current); - writeReference(current); - } - } else if (current instanceof COSObject) { - COSBase subValue = ((COSObject) current).getObject(); - if (willEncrypt || incrementalUpdate || subValue instanceof COSDictionary || - subValue == null) { - // PDFBOX-4308: added willEncrypt to prevent an object - // that is referenced several times from being written - // direct and indirect, thus getting encrypted - // with wrong object number or getting encrypted twice - addObjectToWrite(current); - writeReference(current); - } else { - subValue.accept(this); - } - } else if (current == null) { - COSNull.NULL.accept(this); - } else { - current.accept(this); - } - count++; - if (i.hasNext()) { - if (count % 10 == 0) { - getStandardOutput().writeEOL(); - } else { - getStandardOutput().write(SPACE); - } - } - } - getStandardOutput().write(ARRAY_CLOSE); - getStandardOutput().writeEOL(); - } - - @Override - public void visitFromBoolean(COSBoolean obj) throws IOException { - obj.writePDF(getStandardOutput()); - } - - @Override - public void visitFromDictionary(COSDictionary obj) throws IOException { - if (!reachedSignature) { - COSBase itemType = obj.getItem(COSName.TYPE); - if (COSName.SIG.equals(itemType) || COSName.DOC_TIME_STAMP.equals(itemType)) { - reachedSignature = true; - } - } - getStandardOutput().write(DICT_OPEN); - getStandardOutput().writeEOL(); - for (Map.Entry<COSName, COSBase> entry : obj.entrySet()) { - COSBase value = entry.getValue(); - if (value != null) { - entry.getKey().accept(this); - getStandardOutput().write(SPACE); - if (value instanceof COSDictionary) { - COSDictionary dict = (COSDictionary) value; - - if (!incrementalUpdate) { - // write all XObjects as direct objects, this will save some size - // PDFBOX-3684: but avoid dictionary that references itself - COSBase item = dict.getItem(COSName.XOBJECT); - if (item != null && !COSName.XOBJECT.equals(entry.getKey())) { - item.setDirect(true); - } - item = dict.getItem(COSName.RESOURCES); - if (item != null && !COSName.RESOURCES.equals(entry.getKey())) { - item.setDirect(true); - } - } - - if (dict.isDirect()) { - // If the object should be written direct, we need - // to pass the dictionary to the visitor again. - visitFromDictionary(dict); - } else { - addObjectToWrite(dict); - writeReference(dict); - } - } else if (value instanceof COSObject) { - COSBase subValue = ((COSObject) value).getObject(); - if (willEncrypt || incrementalUpdate || subValue instanceof COSDictionary || - subValue == null) { - // PDFBOX-4308: added willEncrypt to prevent an object - // that is referenced several times from being written - // direct and indirect, thus getting encrypted - // with wrong object number or getting encrypted twice - addObjectToWrite(value); - writeReference(value); - } else { - subValue.accept(this); - } - } else { - // If we reach the pdf signature, we need to determinate the position of the - // content and byterange - if (reachedSignature && COSName.CONTENTS.equals(entry.getKey())) { - signatureOffset = getStandardOutput().getPos(); - value.accept(this); - signatureLength = getStandardOutput().getPos() - signatureOffset; - } else if (reachedSignature && COSName.BYTERANGE.equals(entry.getKey())) { - byteRangeArray = (COSArray) entry.getValue(); - byteRangeOffset = getStandardOutput().getPos() + 1; - value.accept(this); - byteRangeLength = getStandardOutput().getPos() - 1 - byteRangeOffset; - reachedSignature = false; - } else { - value.accept(this); - } - } - getStandardOutput().writeEOL(); - - } else { - //then we won't write anything, there are a couple cases - //were the value of an entry in the COSDictionary will - //be a dangling reference that points to nothing - //so we will just not write out the entry if that is the case - } - } - getStandardOutput().write(DICT_CLOSE); - getStandardOutput().writeEOL(); - } - - @Override - public void visitFromDocument(COSDocument doc) throws IOException { - if (!incrementalUpdate) { - doWriteHeader(doc); - } else { - // Sometimes the original file will be missing a newline at the end - // In order to avoid having %%EOF the first object on the same line - // as the %%EOF, we put a newline here. If there's already one at - // the end of the file, an extra one won't hurt. PDFBOX-1051 - getStandardOutput().writeCRLF(); - } - - doWriteBody(doc); - - // get the previous trailer - COSDictionary trailer = doc.getTrailer(); - long hybridPrev = -1; - - if (trailer != null) { - hybridPrev = trailer.getLong(COSName.XREF_STM); - } - - if (incrementalUpdate || doc.isXRefStream()) { - doWriteXRefInc(doc, hybridPrev); - } else { - doWriteXRefTable(); - doWriteTrailer(doc); - } - - // write endof - getStandardOutput().write(STARTXREF); - getStandardOutput().writeEOL(); - getStandardOutput().write( - String.valueOf(getStartxref()).getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().writeEOL(); - getStandardOutput().write(EOF); - getStandardOutput().writeEOL(); - - if (incrementalUpdate) { - if (signatureOffset == 0 || byteRangeOffset == 0) { - doWriteIncrement(); - } else { - doWriteSignature(); - } - } - } - - @Override - public void visitFromFloat(COSFloat obj) throws IOException { - obj.writePDF(getStandardOutput()); - - } - - @Override - public void visitFromInt(COSInteger obj) throws IOException { - obj.writePDF(getStandardOutput()); - } - - @Override - public void visitFromName(COSName obj) throws IOException { - obj.writePDF(getStandardOutput()); - } - - @Override - public void visitFromNull(COSNull obj) throws IOException { - obj.writePDF(getStandardOutput()); - } - - /** - * visitFromObjRef method comment. - * - * @param obj The object that is being visited. - * @throws IOException If there is an exception while visiting this object. - */ - public void writeReference(COSBase obj) throws IOException { - COSObjectKey key = getObjectKey(obj); - float randomThreshold = config.getRandomizeRefNumbers(); - float r = random.nextFloat(); - if (randomThreshold > 0.0f && r < randomThreshold) { - long num = random.nextInt(roughNumberOfObjects); - LOG.debug("corrupting ref number: " + key.getNumber() + " -> " + num); - getStandardOutput().write(String.valueOf(num).getBytes(StandardCharsets.ISO_8859_1)); - } else { - getStandardOutput().write( - String.valueOf(key.getNumber()).getBytes(StandardCharsets.ISO_8859_1)); - - } - getStandardOutput().write(SPACE); - getStandardOutput().write( - String.valueOf(key.getGeneration()).getBytes(StandardCharsets.ISO_8859_1)); - getStandardOutput().write(SPACE); - getStandardOutput().write(REFERENCE); - } - - @Override - public void visitFromStream(COSStream obj) throws IOException { - if (willEncrypt) { - pdDocument.getEncryption().getSecurityHandler() - .encryptStream(obj, currentObjectKey.getNumber(), - currentObjectKey.getGeneration()); - } - - InputStream input = null; - try { - // write the stream content - visitFromDictionary(obj); - getStandardOutput().write(STREAM); - getStandardOutput().writeCRLF(); - - input = obj.createRawInputStream(); - IOUtils.copy(input, getStandardOutput()); - - getStandardOutput().writeCRLF(); - getStandardOutput().write(ENDSTREAM); - getStandardOutput().writeEOL(); - } finally { - if (input != null) { - input.close(); - } - } - - } - - @Override - public void visitFromString(COSString obj) throws IOException { - if (willEncrypt) { - pdDocument.getEncryption().getSecurityHandler() - .encryptString(obj, currentObjectKey.getNumber(), - currentObjectKey.getGeneration()); - } - COSWriter.writeString(obj, getStandardOutput()); - } - - /** - * This will write the pdf document. } - * - * @param doc The document to write. - * @throws IOException If an error occurs while generating the data. - */ - public void write(COSDocument doc) throws IOException { - PDDocument pdDoc = new PDDocument(doc); - write(pdDoc); - } - - /** - * This will write the pdf document. If signature should be created externally, - * {@link #writeExternalSignature(byte[])} should be invoked to set signature after calling this method. - * - * @param doc The document to write. - * @throws IOException If an error occurs while generating the data. - */ - public void write(PDDocument doc) throws IOException { - write(doc, null); - } - - /** - * This will write the pdf document. If signature should be created externally, - * {@link #writeExternalSignature(byte[])} should be invoked to set signature after calling this method. - * - * @param doc The document to write. - * @param signInterface class to be used for signing; {@code null} if external signing would be performed - * or there will be no signing at all - * @throws IOException If an error occurs while generating the data. - * @throws IllegalStateException If the document has an encryption dictionary but no protection - * policy. - */ - public void write(PDDocument doc, SignatureInterface signInterface) throws IOException { - long idTime = - doc.getDocumentId() == null ? System.currentTimeMillis() : doc.getDocumentId(); - - pdDocument = doc; - signatureInterface = signInterface; - - if (incrementalUpdate) { - prepareIncrement(doc); - } - - // if the document says we should remove encryption, then we shouldn't encrypt - if (doc.isAllSecurityToBeRemoved()) { - willEncrypt = false; - // also need to get rid of the "Encrypt" in the trailer so readers - // don't try to decrypt a document which is not encrypted - COSDocument cosDoc = doc.getDocument(); - COSDictionary trailer = cosDoc.getTrailer(); - trailer.removeItem(COSName.ENCRYPT); - } else { - if (pdDocument.getEncryption() != null) { - if (!incrementalUpdate) { - SecurityHandler securityHandler = - pdDocument.getEncryption().getSecurityHandler(); - if (!securityHandler.hasProtectionPolicy()) { - throw new IllegalStateException( - "PDF contains an encryption dictionary, please remove it with " + - "setAllSecurityToBeRemoved() or set a protection policy with protect()"); - } - securityHandler.prepareDocumentForEncryption(pdDocument); - } - willEncrypt = true; - } else { - willEncrypt = false; - } - } - - COSDocument cosDoc = pdDocument.getDocument(); - COSDictionary trailer = cosDoc.getTrailer(); - COSArray idArray; - boolean missingID = true; - COSBase base = trailer.getDictionaryObject(COSName.ID); - if (base instanceof COSArray) { - idArray = (COSArray) base; - if (idArray.size() == 2) { - missingID = false; - } - } else { - idArray = new COSArray(); - } - if (missingID || incrementalUpdate) { - MessageDigest md5; - try { - md5 = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - // should never happen - throw new RuntimeException(e); - } - - // algorithm says to use time/path/size/values in doc to generate the id. - // we don't have path or size, so do the best we can - md5.update(Long.toString(idTime).getBytes(StandardCharsets.ISO_8859_1)); - - COSDictionary info = trailer.getCOSDictionary(COSName.INFO); - if (info != null) { - for (COSBase cosBase : info.getValues()) { - md5.update(cosBase.toString().getBytes(StandardCharsets.ISO_8859_1)); - } - } - // reuse origin documentID if available as first value - COSString firstID = - missingID ? new COSString(md5.digest()) : (COSString) idArray.get(0); - // it's ok to use the same ID for the second part if the ID is created for the first time - COSString secondID = missingID ? firstID : new COSString(md5.digest()); - idArray = new COSArray(); - idArray.add(firstID); - idArray.add(secondID); - trailer.setItem(COSName.ID, idArray); - } - cosDoc.accept(this); - } - - /** - * This will write the fdf document. - * - * @param doc The document to write. - * @throws IOException If an error occurs while generating the data. - */ - public void write(FDFDocument doc) throws IOException { - fdfDocument = doc; - willEncrypt = false; - COSDocument cosDoc = fdfDocument.getDocument(); - cosDoc.accept(this); - } - - private byte[] getBytes(OutputStream stream) throws IOException { - if (stream instanceof ByteArrayOutputStream) { - return ((ByteArrayOutputStream) stream).toByteArray(); - } else if (stream instanceof UnsynchronizedByteArrayOutputStream) { - return ((UnsynchronizedByteArrayOutputStream) stream).toByteArray(); - } - throw new IOException("OutputStream " + stream.getClass().getName() + " is not supported"); - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java deleted file mode 100644 index d4edac739..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.pdf; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Collections; -import java.util.Set; - -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.RandomAccessReadBuffer; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.fuzzing.exceptions.CantFuzzException; -import org.apache.tika.mime.MediaType; - -public class PDFTransformer implements Transformer { - private static final Set<MediaType> SUPPORTED_TYPES = - Collections.singleton(MediaType.application("pdf")); - private PDFTransformerConfig config = new PDFTransformerConfig(); - - @Override - public Set<MediaType> getSupportedTypes() { - return SUPPORTED_TYPES; - } - - @Override - public void transform(InputStream is, OutputStream os) throws IOException, TikaException { - try (PDDocument pdDocument = Loader.loadPDF(new RandomAccessReadBuffer(is))) { - //some docs have security which prevents mods and writing - //given our purposes here, we should remove security - pdDocument.setAllSecurityToBeRemoved(true); - try (EvilCOSWriter cosWriter = new EvilCOSWriter(os, config)) { - cosWriter.write(pdDocument); - } - } catch (InvalidPasswordException e) { - throw new CantFuzzException("encrypted doc"); - } - } - - public void setConfig(PDFTransformerConfig pdfTransformerConfig) { - this.config = pdfTransformerConfig; - } -} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java deleted file mode 100644 index a494d4a72..000000000 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.fuzzing.pdf; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Random; -import java.util.Set; - -import org.apache.pdfbox.cos.COSArray; -import org.apache.pdfbox.cos.COSBase; -import org.apache.pdfbox.cos.COSName; - -import org.apache.tika.fuzzing.Transformer; -import org.apache.tika.fuzzing.general.ByteDeleter; -import org.apache.tika.fuzzing.general.ByteFlipper; -import org.apache.tika.fuzzing.general.ByteInjector; -import org.apache.tika.fuzzing.general.GeneralTransformer; -import org.apache.tika.fuzzing.general.SpanSwapper; -import org.apache.tika.fuzzing.general.Truncator; - -public class PDFTransformerConfig { - - private final Random random = new Random(); - - private float randomizeObjectNumbers = -1.0f; - - private float randomizeRefNumbers = -1.0f; - - private int maxFilters = 1; - private int minFilters = 1; - - private long maxFilteredStreamLength = -1; - - private Set<COSName> allowableFilters = new HashSet<>(); - - private Transformer streamTransformer = - new GeneralTransformer(1, new ByteDeleter(), new ByteFlipper(), new ByteInjector(), - new SpanSwapper(), new Truncator()); - - private Transformer unfilteredStreamTransformer = - new GeneralTransformer(1, new ByteDeleter(), new ByteFlipper(), new ByteInjector(), - new SpanSwapper(), new Truncator()); - - public float getRandomizeObjectNumbers() { - return randomizeObjectNumbers; - } - - /** - * @param randomizeObjectNumbers probability that a given object number will be randomized. - * If < 0, this will be ignored. - */ - public void setRandomizeObjectNumbers(float randomizeObjectNumbers) { - this.randomizeObjectNumbers = randomizeObjectNumbers; - } - - public float getRandomizeRefNumbers() { - return randomizeRefNumbers; - } - - /** - * @param randomizeRefNumbers probability that a given reference number will be randomized. - * If < 0, this will be ignored. - */ - public void setRandomizeRefNumbers(float randomizeRefNumbers) { - this.randomizeRefNumbers = randomizeRefNumbers; - } - - public Transformer getUnfilteredStreamTransformer() { - return unfilteredStreamTransformer; - } - - /** - * This transformer is applied to the stream _before_ any filters - * are applied. - * - * @param transformer - */ - public void setUnfilteredStreamTransformer(Transformer transformer) { - this.unfilteredStreamTransformer = transformer; - } - - public Transformer getStreamTransformer() { - return streamTransformer; - } - - /** - * This transformer is applied to the stream _after_ each filter has been applied. - * - * @param transformer - */ - public void setStreamTransformer(Transformer transformer) { - this.streamTransformer = transformer; - } - - /** - * @param maxFilters maximum number of filters to apply - */ - public void setMaxFilters(int maxFilters) { - this.maxFilters = maxFilters; - } - - /** - * Which filters are allowed - * - * @return - */ - public Set<COSName> getAllowableFilters() { - return allowableFilters; - } - - public void setAllowableFilters(Set<COSName> allowableFilters) { - this.allowableFilters = allowableFilters; - } - - /** - * If {@link #maxFilters} > 0, this will randomly select filters given - * the {@link #maxFilters} and {@link #minFilters}. If {@link #maxFilters} < 0, - * this will return the existing filters. - * - * @param existingFilters - * @return - */ - public List<COSName> getFilters(COSBase existingFilters) { - if (maxFilters < 0) { - List<COSName> ret = new ArrayList<>(); - if (existingFilters instanceof COSArray) { - for (COSBase obj : ((COSArray) existingFilters)) { - ret.add((COSName) obj); - } - } else if (existingFilters instanceof COSName) { - ret.add((COSName) existingFilters); - } - return ret; - } - - int numFilters; - if (maxFilters - minFilters == 0) { - numFilters = maxFilters; - } else { - numFilters = minFilters + random.nextInt(maxFilters - minFilters); - } - - List<COSName> allowable = new ArrayList<>(allowableFilters); - - List<COSName> filters = new ArrayList<>(); - for (int i = 0; i < numFilters; i++) { - int index = random.nextInt(allowable.size()); - filters.add(allowable.get(index)); - } - return filters; - } - - /** - * Minimum number of filters to apply to streams. - * - * @param minFilters - */ - public void setMinFilters(int minFilters) { - this.minFilters = minFilters; - } - - public long getMaxFilteredStreamLength() { - return maxFilteredStreamLength; - } - - /** - * Maximum filtered stream length. AsciiHex doubles the size of the stream with - * each encoding. This is used as a circuit breaker to stop adding filters - * if the stream goes above a given length. - * - * @param maxFilteredStreamLength - */ - public void setMaxFilteredStreamLength(long maxFilteredStreamLength) { - this.maxFilteredStreamLength = maxFilteredStreamLength; - } -} diff --git a/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer b/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer deleted file mode 100644 index 07390de1a..000000000 --- a/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer +++ /dev/null @@ -1,17 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.tika.fuzzing.general.GeneralTransformer -#org.apache.tika.fuzzing.pdf.PDFTransformer \ No newline at end of file diff --git a/tika-fuzzing/src/main/resources/log4j2.xml b/tika-fuzzing/src/main/resources/log4j2.xml deleted file mode 100644 index 94ac22b3e..000000000 --- a/tika-fuzzing/src/main/resources/log4j2.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<Configuration status="WARN"> - <Appenders> - <Console name="Console" target="SYSTEM_ERR"> - <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/> - </Console> - </Appenders> - <Loggers> - <Root level="info"> - <AppenderRef ref="Console"/> - </Root> - <Logger name="org.apache.tika.pipes" level="error" additivity="false"> - <AppenderRef ref="Console"/> - </Logger> - <Logger name="com.github.junrar" level="error" additivity="false"> - <AppenderRef ref="Console"/> - </Logger> - </Loggers> -</Configuration> diff --git a/tika-fuzzing/src/test/java/TestFuzzingCLI.java b/tika-fuzzing/src/test/java/TestFuzzingCLI.java deleted file mode 100644 index 9e3e49dc0..000000000 --- a/tika-fuzzing/src/test/java/TestFuzzingCLI.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; - -import org.apache.commons.io.FileUtils; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import org.apache.tika.fuzzing.cli.FuzzingCLI; -import org.apache.tika.utils.ProcessUtils; - -public class TestFuzzingCLI { - - @Test - @Disabled - public void testBasic() throws Exception { - //convert to actual unit test - String inputDir = "";// fill in - String outputDir = "";//fill in - String[] args = new String[]{"-i", inputDir, "-o", outputDir, "-n", "8", // num threads - "-t", "1", //max transformers - "-p", "100", //per file iterations - "-r", "3"}; - FuzzingCLI.main(args); - } - - @Test - @Disabled - public void testMock() throws Exception { - //convert to actual unit test - Path inputDir = Paths.get(getClass().getResource("/test-documents").toURI()); - Path outputDir = Files.createTempDirectory("tika-fuzzing-"); - String[] args = new String[]{"-i", - ProcessUtils.escapeCommandLine(inputDir.toAbsolutePath().toString()), "-o", - ProcessUtils.escapeCommandLine(outputDir.toAbsolutePath().toString()), "-n", "8", - // num threads - "-t", "0", //max transformers - "-p", "10", //per file iterations - "-m", "10000", //max ms per file - "-r", "3"}; - try { - FuzzingCLI.main(args); - } finally { - FileUtils.deleteDirectory(outputDir.toFile()); - } - } -} diff --git a/tika-fuzzing/src/test/java/TestTransformer.java b/tika-fuzzing/src/test/java/TestTransformer.java deleted file mode 100644 index 3adc4e3e1..000000000 --- a/tika-fuzzing/src/test/java/TestTransformer.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Arrays; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import org.apache.tika.fuzzing.general.GeneralTransformer; - -public class TestTransformer { - - @Test - @Disabled - public void testBasic() throws Exception { - //turn into actual unit test - Path path = Paths.get("");//put something meaningful here - - GeneralTransformer transformer = new GeneralTransformer(); - byte[] bytes = Files.readAllBytes(path); - - for (int i = 0; i < 100; i++) { - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - transformer.transform(new ByteArrayInputStream(bytes), bos); - - if (Arrays.equals(bos.toByteArray(), bytes)) { - System.out.println("SAME"); - } - } - } -} diff --git a/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml b/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml deleted file mode 100644 index 2210ae68e..000000000 --- a/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml +++ /dev/null @@ -1,57 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<!-- this is an example configuration file to run the fuzzer against - an input directory. Make sure to specify the input file directory - in the base paths. We need the "empty" fetchers and emitters to - handle the temp files that are created via fuzzing--> -<properties> - <fetchers> - <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher"> - <name>fsf</name> - <basePath>{FILL_IN_HERE}</basePath> - </fetcher> - <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher"> - <name>temp</name> - </fetcher> - </fetchers> - <emitters> - <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter"> - <name>fse</name> - <basePath>{FILL_IN_HERE}</basePath> - </emitter> - <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter"> - <name>temp</name> - </emitter> - </emitters> - <pipesIterator class="org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator"> - <basePath>{FILL_IN_HERE}</basePath> - <fetcherName>fsf</fetcherName> - <emitterName>fse</emitterName> - </pipesIterator> - <pipes> - <numClients>5</numClients> - <forkedJvmArgs> - <arg>-Xmx1g</arg> - <arg>-XX:ParallelGCThreads=2</arg> - <arg>-Dlog4j.configurationFile={FILL_IN_HERE}</arg> - </forkedJvmArgs> - <timeoutMillis>10000</timeoutMillis> - </pipes> -</properties> \ No newline at end of file diff --git a/tika-fuzzing/src/test/resources/log4j2.xml b/tika-fuzzing/src/test/resources/log4j2.xml deleted file mode 100644 index eaeca677e..000000000 --- a/tika-fuzzing/src/test/resources/log4j2.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<Configuration status="WARN"> - <Appenders> - <Console name="Console" target="SYSTEM_ERR"> - <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/> - </Console> - </Appenders> - <Loggers> - <Root level="info"> - <AppenderRef ref="Console"/> - </Root> - <Logger name="org.apache.tika.pipes" level="error" additivity="false"> - <AppenderRef ref="Console"/> - </Logger> - <Logger name="com.github.junrar" level="error" additivity="false"> - <AppenderRef ref="Console"/> - </Logger> - <Logger name="org.apache.pdfbox" level="fatal" additivity="false"> - <AppenderRef ref="Console"/> - </Logger> - - </Loggers> -</Configuration> \ No newline at end of file diff --git a/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml b/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml deleted file mode 100644 index c9e028a5d..000000000 --- a/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml +++ /dev/null @@ -1,25 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<mock> - <metadata action="add" name="author">Nikolai Lobachevsky</metadata> - <write element="p">some content</write> - <hang millis="30000" heavy="true" pulse_millis="100"/> -</mock> \ No newline at end of file diff --git a/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml b/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml deleted file mode 100644 index e497da5b7..000000000 --- a/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml +++ /dev/null @@ -1,25 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<mock> - <metadata action="add" name="author">Nikolai Lobachevsky</metadata> - <write element="p">some content</write> - <throw class="java.lang.NullPointerException">another null pointer exception</throw> -</mock> \ No newline at end of file diff --git a/tika-fuzzing/src/test/resources/test-documents/system_exit.xml b/tika-fuzzing/src/test/resources/test-documents/system_exit.xml deleted file mode 100644 index 52feede1c..000000000 --- a/tika-fuzzing/src/test/resources/test-documents/system_exit.xml +++ /dev/null @@ -1,25 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<mock> - <metadata action="add" name="author">Nikolai Lobachevsky</metadata> - <write element="p">some content</write> - <system_exit/> -</mock> \ No newline at end of file
