This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3083 in repository https://gitbox.apache.org/repos/asf/tika.git
commit bd56182c548b027883ceb69d29a91a6aae3c081a Author: tallison <[email protected]> AuthorDate: Fri Apr 3 17:41:18 2020 -0400 TIKA-3083 -- add fuzzing module --- pom.xml | 1 + tika-fuzzing/pom.xml | 59 + .../apache/tika/fuzzing/AutoDetectTransformer.java | 96 ++ .../java/org/apache/tika/fuzzing/Transformer.java | 41 + .../java/org/apache/tika/fuzzing/cli/FuzzOne.java | 266 ++++ .../org/apache/tika/fuzzing/cli/FuzzingCLI.java | 240 ++++ .../apache/tika/fuzzing/cli/FuzzingCLIConfig.java | 160 +++ .../tika/fuzzing/exceptions/CantFuzzException.java | 25 + .../apache/tika/fuzzing/general/ByteDeleter.java | 53 + .../apache/tika/fuzzing/general/ByteFlipper.java | 67 + .../apache/tika/fuzzing/general/ByteInjector.java | 76 ++ .../tika/fuzzing/general/GeneralTransformer.java | 95 ++ .../apache/tika/fuzzing/general/SpanSwapper.java | 84 ++ .../org/apache/tika/fuzzing/general/Truncator.java | 60 + .../org/apache/tika/fuzzing/pdf/EvilCOSWriter.java | 1283 ++++++++++++++++++++ .../apache/tika/fuzzing/pdf/PDFTransformer.java | 52 + .../tika/fuzzing/pdf/PDFTransformerConfig.java | 26 + .../services/org.apache.tika.fuzzing.Transformer | 17 + tika-fuzzing/src/main/resources/log4j.properties | 24 + tika-fuzzing/src/test/java/TestFuzzingCLI.java | 67 + tika-fuzzing/src/test/java/TestTransformer.java | 49 + .../test/resources/test-documents/heavy_hang.xml | 25 + .../test/resources/test-documents/null_pointer.xml | 25 + .../test/resources/test-documents/system_exit.xml | 25 + 24 files changed, 2916 insertions(+) diff --git a/pom.xml b/pom.xml index 89ee2e2..486c789 100644 --- a/pom.xml +++ b/pom.xml @@ -44,6 +44,7 @@ <module>tika-batch</module> <module>tika-app</module> <module>tika-server</module> + <module>tika-fuzzing</module> <module>tika-translate</module> <module>tika-langdetect</module> <module>tika-example</module> diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml new file mode 100644 index 0000000..19c89ed --- /dev/null +++ b/tika-fuzzing/pom.xml @@ -0,0 +1,59 @@ 
+<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parent</artifactId> + <version>2.0.0-SNAPSHOT</version> + <relativePath>../tika-parent/pom.xml</relativePath> + </parent> + + <artifactId>tika-fuzzing</artifactId> + <name>Apache Tika fuzzing</name> + <url>http://tika.apache.org/</url> + + <modelVersion>4.0.0</modelVersion> + + + <dependencies> + <dependency> + <groupId>commons-cli</groupId> + <artifactId>commons-cli</artifactId> + <version>${cli.version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>${project.version}</version> + </dependency> + <!-- logging --> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>jul-to-slf4j</artifactId> + </dependency> + <!-- test --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <!-- bring in the mock parser --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + </dependencies> + +</project> \ No newline at end of file diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java new file mode 100644 index 0000000..f27f4a0 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java @@ -0,0 +1,96 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing; + +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.fuzzing.general.GeneralTransformer; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MediaTypeRegistry; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class AutoDetectTransformer implements Transformer { + + private static final ServiceLoader DEFAULT_LOADER = + new ServiceLoader(AutoDetectTransformer.class.getClassLoader()); + + TikaConfig config = TikaConfig.getDefaultConfig(); + MediaTypeRegistry registry = config.getMediaTypeRegistry(); + Detector detector = TikaConfig.getDefaultConfig().getDetector(); + + Transformer fallback = new GeneralTransformer(); + Map<MediaType, Transformer> transformerMap = new HashMap<>(); + + public AutoDetectTransformer() { + 
this(DEFAULT_LOADER.loadServiceProviders(org.apache.tika.fuzzing.Transformer.class)); + } + + public AutoDetectTransformer(List<Transformer> transformers) { + for (Transformer t : transformers) { + for (MediaType mediaType : t.getSupportedTypes()) { + transformerMap.put(mediaType, t); + } + } + } + + @Override + public Set<MediaType> getSupportedTypes() { + return transformerMap.keySet(); + } + + @Override + public void transform(InputStream is, OutputStream os) throws IOException, TikaException { + try (TikaInputStream tis = TikaInputStream.get(is)) { + // Automatically detect the MIME type of the document + Metadata metadata = new Metadata(); + MediaType type = detector.detect(tis, metadata); + Transformer transformer = getTransformer(type); + transformer.transform(tis, os); + } + } + + private Transformer getTransformer(MediaType type) { + if (type == null) { + return fallback; + } + // We always work on the normalised, canonical form + type = registry.normalize(type); + + while (type != null) { + // Try finding a parser for the type + Transformer transformer = transformerMap.get(type); + if (transformer != null) { + return transformer; + } + + // Failing that, try for the parent of the type + type = registry.getSupertype(type); + } + return fallback; + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java new file mode 100644 index 0000000..7e3d083 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Set; + +public interface Transformer { + + /** + * Returns the set of media types supported by this parser when used + * with the given parse context. + * + * @since Apache Tika 1.24.1 + * @return immutable set of media types + */ + Set<MediaType> getSupportedTypes(); + + + void transform(InputStream is, OutputStream os) throws IOException, TikaException; +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java new file mode 100644 index 0000000..faa1383 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing.cli; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.fuzzing.AutoDetectTransformer; +import org.apache.tika.fuzzing.Transformer; +import org.apache.tika.fuzzing.exceptions.CantFuzzException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.utils.ExceptionUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +/** + * Child process that runs against a single input file + */ +public class FuzzOne { + private static final Logger LOG = LoggerFactory.getLogger(FuzzOne.class); + + static 
Options OPTIONS; + static { + //By the time this commandline is parsed, there should be both an extracts and an inputDir + Option extracts = new Option("extracts", true, "directory for extract files"); + extracts.setRequired(true); + + + OPTIONS = new Options() + .addOption(Option.builder("i") + .longOpt("inputFile") + .desc("input directory for seed files") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("o") + .longOpt("outputFile") + .desc("output file base") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("m") + .longOpt("timeoutMs") + .desc("timeout in ms -- max time allowed to parse a file") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("n") + .desc("thread id (thread number)") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("p") + .longOpt("perFile") + .desc("number of iterations to run per seed file") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("t") + .longOpt("maxTransformers") + .desc("maximum number of transformers to run per iteration") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("r") + .longOpt("retryId") + .desc("which retry is this") + .hasArg(true) + .required(true) + .build()); + } + Parser parser = new AutoDetectParser(); + + public static void main(String[] args) throws Exception { + FuzzOneConfig config = FuzzOneConfig.parse(args); + FuzzOne fuzzOne = new FuzzOne(); + fuzzOne.execute(config); + } + + private void execute(FuzzOneConfig config) { + Path src = config.inputFile; + Path targetDir = config.outputFileBase; + AutoDetectTransformer transformer = new AutoDetectTransformer(); + for (int i = 0; i < config.perFileIterations; i++) { + try { + String ext = "-"+config.threadNum + "-" + config.retryNum + "-"+i; + fuzz(ext, src, targetDir, transformer, config.timeoutMs); + } catch (IOException e) { + LOG.warn("problem transforming file", e); + } catch (CantFuzzException e) { + 
LOG.warn("can't fuzz this file "+src, e); + return; + } catch (TikaException e) { + e.printStackTrace(); + } + } + } + + private void fuzz(String ext, Path src, Path targetFileBase, + Transformer transformer, long timeoutMs) throws IOException, TikaException { + + Path target = targetFileBase.getParent().resolve( + targetFileBase.getFileName().toString() +ext); + + try { + transformFile(transformer, src, target); + } catch (Throwable t) { + LOG.warn("failed to transform: " + src.toString()); + Files.delete(target); + throw t; + } + ExecutorService executor = Executors.newSingleThreadExecutor(); + Future<Integer> future = executor.submit(new ParseTask(target)); + + try { + int result = future.get(timeoutMs, TimeUnit.MILLISECONDS); + if (result == 1 && Files.exists(target)) { + LOG.warn("failed to delete target: "+target); + } + } catch (TimeoutException e) { + LOG.warn("timeout exception:"+target); + future.cancel(true); + writeErrFile(target, ".timeout"); + System.exit(1); + } catch (InterruptedException|ExecutionException e) { + LOG.warn("problem parsing "+target, e); + System.exit(1); + } finally { + executor.shutdownNow(); + } + } + + private void writeErrFile(Path target, String ext) { + try { + Path err = target.getParent().resolve(target.getFileName().toString()+ext); + Files.write(err, new byte[0]); + } catch (IOException e) { + LOG.warn("things aren't going right today.", e); + } + } + + private void handleThrowable(Path target, Throwable t) { + + try { + Path errMsg = target.getParent().resolve(target.getFileName().toString()+".stacktrace"); + Files.write(errMsg, ExceptionUtils.getStackTrace(t).getBytes(StandardCharsets.UTF_8)); + } catch (IOException e) { + LOG.warn("things aren't going right today.", t); + } + + } + + private void transformFile(Transformer transformer, Path src, Path target) throws IOException, TikaException { + try (InputStream is = Files.newInputStream(src); OutputStream os = + Files.newOutputStream(target)) { + 
transformer.transform(is, os); + } + } + + private static class FuzzOneConfig { + static FuzzOneConfig parse(String[] args) throws ParseException { + CommandLineParser parser = new DefaultParser(); + CommandLine commandLine = parser.parse(OPTIONS, args); + FuzzOneConfig config = new FuzzOneConfig(); + config.inputFile = Paths.get(commandLine.getOptionValue("i")); + config.outputFileBase = Paths.get(commandLine.getOptionValue("o")); + config.perFileIterations = Integer.parseInt(commandLine.getOptionValue("p")); + config.maxTransformers = Integer.parseInt(commandLine.getOptionValue("t")); + config.threadNum = Integer.parseInt(commandLine.getOptionValue("n")); + config.retryNum = Integer.parseInt(commandLine.getOptionValue("r")); + config.timeoutMs = Integer.parseInt(commandLine.getOptionValue("m")); + return config; + } + + private Path inputFile; + private Path outputFileBase; + int perFileIterations; + int maxTransformers; + int threadNum; + int retryNum; + long timeoutMs; + + } + + private class ParseTask implements Callable<Integer> { + private final Path target; + public ParseTask(Path target) { + this.target = target; + } + + /** + * + * @return 1 if success + * @throws Exception + */ + @Override + public Integer call() throws Exception { + boolean success = false; + try (InputStream is = Files.newInputStream(target)) { + LOG.debug("parsing "+target); + parser.parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); + success = true; + } catch (TikaException e) { + if (e.getCause() != null && e.getCause() instanceof RuntimeException) { + //handleThrowable(target, e.getCause()); + success = true; + } else { + success = true; + } + } catch (SAXException|IOException e) { + success = true; + } catch (Throwable t) { + handleThrowable(target, t); + } finally { + if (success) { + try { + Files.delete(target); + } catch (IOException e) { + LOG.warn("couldn't delete: "+target.toAbsolutePath()); + } + } else { + LOG.info("FOUND PROBLEM: "+target); + } + } + 
return success ? 1 : 0; + } + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java new file mode 100644 index 0000000..3857a9a --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fuzzing.cli; + +import org.apache.tika.utils.ProcessUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.FileVisitResult; +import java.nio.file.FileVisitor; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +public class FuzzingCLI { + private static final Logger LOG = LoggerFactory.getLogger(FuzzingCLI.class); + + private static final Path POISON = Paths.get(""); + + private int maxFiles = -1; + + public static void main (String[] args) throws Exception { + FuzzingCLIConfig config = FuzzingCLIConfig.parse(args); + if (config.getMaxTransformers() == 0) { + LOG.warn("max transformers == 0!"); + } + if (! 
Files.isDirectory(config.inputDir)) { + throw new IllegalArgumentException("input directory doesn't exist: " + config.inputDir); + } + FuzzingCLI fuzzingCLI = new FuzzingCLI(); + Files.createDirectories(config.getOutputDirectory()); + fuzzingCLI.execute(config); + } + + private void execute(FuzzingCLIConfig config) { + ArrayBlockingQueue<Path> q = new ArrayBlockingQueue(10000); + ExecutorService executorService = Executors.newFixedThreadPool(config.getNumThreads()+1); + ExecutorCompletionService executorCompletionService = new ExecutorCompletionService(executorService); + FileAdder fileAdder = new FileAdder(config.getInputDirectory(), config.getNumThreads(), q); + executorCompletionService.submit(fileAdder); + for (int i = 0; i < config.numThreads; i++) { + executorCompletionService.submit(new Fuzzer(q, config)); + } + int finished = 0; + while (finished < config.getNumThreads()+1) { + Future<Integer> future = null; + try { + future = executorCompletionService.poll(1, TimeUnit.SECONDS); + if (future != null) { + future.get(); + finished++; + } + } catch (InterruptedException | ExecutionException e) { + e.printStackTrace(); + break; + } + } + executorService.shutdownNow(); + } + + private static class Fuzzer implements Callable<Integer> { + static AtomicInteger COUNTER = new AtomicInteger(); + private final int threadId = COUNTER.getAndIncrement(); + private final ArrayBlockingQueue<Path> q; + private final FuzzingCLIConfig config; + public Fuzzer(ArrayBlockingQueue<Path> q, FuzzingCLIConfig config) { + this.q = q; + this.config = config; + } + + @Override + public Integer call() throws Exception { + while (true) { + Path p = q.take(); + if (p.equals(POISON)) { + LOG.debug("Thread "+threadId + " stopping"); + return 1; + } + boolean success = false; + int tries = 0; + while (! 
success && tries < config.getRetries()) { + if (tries > 0) { + LOG.warn("Retrying ("+tries+") "+p); + } + success = fuzzIt(config, p, tries); + tries++; + } + } + } + + private boolean fuzzIt(FuzzingCLIConfig config, Path p, int retryId) { + //the target files should be flattened so that + //problematic files are all in one directory...may rethink this option later + Path target = config.getOutputDirectory().resolve( + p.getFileName()); + String cp = System.getProperty("java.class.path"); + + String[] args = new String[] { + "java", + "-ea", + "-cp", + ProcessUtils.escapeCommandLine(cp), + "org.apache.tika.fuzzing.cli.FuzzOne", + "-i", + ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()), + "-o", + ProcessUtils.escapeCommandLine(target.toAbsolutePath().toString()), + "-p", + Integer.toString(config.getPerFileIterations()), + "-t", + Integer.toString(config.getMaxTransformers()), + "-n", + Integer.toString(threadId), + "-r", + Integer.toString(retryId), + "-m", + Long.toString(config.getTimeoutMs()) + }; + ProcessBuilder pb = new ProcessBuilder(args); + pb.inheritIO(); + Process process = null; + boolean success = false; + try { + process = pb.start(); + } catch (IOException e) { + LOG.warn("problem starting process", e); + } + try { + long totalTime = 2*config.getTimeoutMs()+config.getPerFileIterations(); + success = process.waitFor(totalTime, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + LOG.warn("problem waiting for process to finish", e); + } finally { + if (process.isAlive()) { + LOG.warn("process still alive for " + target.toAbsolutePath()); + process.destroyForcibly(); + } + try { + int exitValue = process.exitValue(); + if (exitValue != 0) { + success = false; + LOG.warn("bad exit value for " + target.toAbsolutePath()); + } + } catch (IllegalThreadStateException e) { + success = false; + LOG.warn("not exited"); + process.destroyForcibly(); + } + } + return success; + } + + } + + private class FileAdder implements Callable<Integer> 
{ + private final Path inputDir; + private final int numThreads; + private final ArrayBlockingQueue<Path> queue; + private int added = 0; + public FileAdder(Path inputDirectory, int numThreads, ArrayBlockingQueue<Path> queue) { + this.inputDir = inputDirectory; + this.numThreads = numThreads; + this.queue = queue; + } + + @Override + public Integer call() throws Exception { + Files.walkFileTree(inputDir, new DirWalker()); + for (int i = 0; i < numThreads; i++) { + queue.add(POISON); + } + return 1; + } + + private class DirWalker implements FileVisitor<Path> { + + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (maxFiles > -1 && added >= maxFiles) { + LOG.info("hit maxfiles; file crawler is stopping early"); + return FileVisitResult.TERMINATE; + } + + try { + boolean offered = queue.offer(file, 10, TimeUnit.MINUTES); + if (offered) { + added++; + return FileVisitResult.CONTINUE; + } else { + LOG.error("couldn't add a file after 10 minutes!"); + return FileVisitResult.TERMINATE; + } + } catch (InterruptedException e) { + e.printStackTrace(); + return FileVisitResult.TERMINATE; + } + } + + @Override + public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + return FileVisitResult.CONTINUE; + } + } + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java new file mode 100644 index 0000000..324b934 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing.cli; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; + +import java.nio.file.Path; +import java.nio.file.Paths; + +public class FuzzingCLIConfig { + + private static final int DEFAULT_NUM_THREADS = 4; + private static final int DEFAULT_NUM_ITERATIONS = 1000; + //allow all transformers to operate + private static final int DEFAULT_MAX_TRANSFORMERS = -1; + + private static final long DEFAULT_TIMEOUT_MS = 120000; + + private static final int DEFAULT_RETRIES = 2; + + static Options OPTIONS; + static { + //By the time this commandline is parsed, there should be both an extracts and an inputDir + Option extracts = new Option("extracts", true, "directory for extract files"); + extracts.setRequired(true); + + + OPTIONS = new Options() + .addOption(Option.builder("i") + .longOpt("inputDir") + .desc("input directory for seed files") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("o") + .longOpt("outputDir") + .desc("output directory for files that triggered 
problems") + .hasArg(true) + .required(true) + .build()) + .addOption(Option.builder("n") + .longOpt("numThreads") + .desc("number of threads") + .hasArg(true) + .required(false) + .build()) + .addOption(Option.builder("p") + .longOpt("perFile") + .desc("number of iterations to run per seed file") + .hasArg(true) + .required(false) + .build()) + .addOption(Option.builder("t") + .longOpt("maxTransformers") + .desc("maximum number of transformers to run per iteration") + .hasArg(true) + .required(false) + .build()) + .addOption(Option.builder("m") + .longOpt("timeoutMs") + .desc("timeout in ms -- max time allowed to parse a file") + .hasArg(true) + .required(false) + .build()) + .addOption(Option.builder("r") + .longOpt("retries") + .desc("number of times to retry a seed file if there's a catastrophic failure") + .hasArg(true) + .required(false) + .build()); + + } + + public static FuzzingCLIConfig parse(String[] args) throws ParseException { + CommandLineParser parser = new DefaultParser(); + CommandLine commandLine = parser.parse(OPTIONS, args); + FuzzingCLIConfig config = new FuzzingCLIConfig(); + config.inputDir = Paths.get(commandLine.getOptionValue("i")); + config.outputDir = Paths.get(commandLine.getOptionValue("o")); + config.numThreads = (commandLine.hasOption("n")) ? + Integer.parseInt(commandLine.getOptionValue("n")) : + DEFAULT_NUM_THREADS; + config.perFileIterations = (commandLine.hasOption("p")) ? + Integer.parseInt(commandLine.getOptionValue("p")) : + DEFAULT_NUM_ITERATIONS; + config.maxTransformers = (commandLine.hasOption("t")) ? + Integer.parseInt(commandLine.getOptionValue("t")) : + DEFAULT_MAX_TRANSFORMERS; + config.timeoutMS = (commandLine.hasOption("m")) ? + Integer.parseInt(commandLine.getOptionValue("m")) : + DEFAULT_TIMEOUT_MS; + config.retries = (commandLine.hasOption("r")) ? 
+ Integer.parseInt(commandLine.getOptionValue("r")) : + DEFAULT_RETRIES; + return config; + } + + + int numThreads; + //number of variants tried per file + int perFileIterations; + //maxTransformers per file + int maxTransformers; + + //max time allowed to process each file in milliseconds + long timeoutMS; + + //times to retry a seed file after a catastrophic failure + int retries; + Path inputDir; + Path outputDir; + + + public int getNumThreads() { + return numThreads; + } + + public Path getInputDirectory() { + return inputDir; + } + + public Path getOutputDirectory() { + return outputDir; + } + + public int getMaxTransformers() { + return maxTransformers; + } + + public long getTimeoutMs() { + return timeoutMS; + } + + public int getPerFileIterations() { + return perFileIterations; + } + + public int getRetries() { + return retries; + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java new file mode 100644 index 0000000..3540822 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing.exceptions; + +import org.apache.tika.exception.TikaException; + +public class CantFuzzException extends TikaException { + public CantFuzzException(String msg) { + super(msg); + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java new file mode 100644 index 0000000..ff26f7f --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fuzzing.general; + +import org.apache.tika.fuzzing.Transformer; +import org.apache.tika.mime.MediaType; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Collections; +import java.util.Random; +import java.util.Set; + +public class ByteDeleter implements Transformer { + Random random = new Random(); + float percentDeleted = 0.01f; + + static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); + + @Override + public Set<MediaType> getSupportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void transform(InputStream is, OutputStream os) throws IOException { + int c = is.read(); + while (c != -1) { + if (random.nextFloat() >= percentDeleted) { + os.write(c); + } else { + //skip + } + c = is.read(); + } + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java new file mode 100644 index 0000000..74e9b5f --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing.general; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.fuzzing.Transformer; +import org.apache.tika.mime.MediaType; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Collections; +import java.util.Random; +import java.util.Set; + +public class ByteFlipper implements Transformer { + + //TODO add something about protecting first x bytes? + private Random random = new Random(); + private float percentCorrupt = 0.01f; + + static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); + + @Override + public Set<MediaType> getSupportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void transform(InputStream is, OutputStream os) throws IOException { + //TODO -- don't load the full thing into memory + byte[] input = IOUtils.toByteArray(is); + if (input.length == 0) { + return; + } + byte[] singleByte = new byte[1]; + //make sure that there's at least one change, even in short files + int atLeastOneIndex = random.nextInt(input.length); + + for (int i = 0; i < input.length; i++) { + if (random.nextFloat() <= percentCorrupt || i == atLeastOneIndex) { + random.nextBytes(singleByte); + os.write(singleByte[0]); + } else { + os.write(input[i]); + } + } + } + + public void setPercentCorrupt(float percentCorrupt) { + percentCorrupt = percentCorrupt; + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java new file mode 100644 index 0000000..2dbfec8 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing.general; + +import org.apache.tika.fuzzing.Transformer; +import org.apache.tika.io.IOUtils; +import org.apache.tika.mime.MediaType; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.Random; +import java.util.Set; + +public class ByteInjector implements Transformer { + Random random = new Random(); + float injectionFrequency = 0.01f; + int maxSpan = 100; + static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); + + @Override + public Set<MediaType> getSupportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void transform(InputStream is, OutputStream os) throws IOException { + //TODO -- don't load the full thing into memory + byte[] input = IOUtils.toByteArray(is); + int numInjections = (int) Math.floor((double)injectionFrequency*(double)input.length); + //at least one injection + numInjections = numInjections == 0 ? 
1 : numInjections; + int[] starts = new int[numInjections]; + if (numInjections > 1) { + for (int i = 0; i < numInjections; i++) { + starts[i] = random.nextInt(input.length - 1); + } + } else { + starts[0] = 0; + } + Arrays.sort(starts); + int startIndex = 0; + + for (int i = 0; i < input.length; i++) { + os.write(input[i]); + if (startIndex < starts.length && starts[startIndex] == i) { + inject(os); + startIndex++; + } + } + } + + private void inject(OutputStream os) throws IOException { + int len = random.nextInt(maxSpan); + byte[] randBytes = new byte[len]; + random.nextBytes(randBytes); + os.write(randBytes); + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java new file mode 100644 index 0000000..803784e --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fuzzing.general; + +import org.apache.commons.compress.utils.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.fuzzing.Transformer; +import org.apache.tika.mime.MediaType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Collections; +import java.util.HashSet; +import java.util.Random; +import java.util.Set; + +public class GeneralTransformer implements Transformer { + + private static final Logger LOG = LoggerFactory.getLogger(GeneralTransformer.class); + + Random random = new Random(); + + private final int maxTransforms; + private final Transformer[] transformers; + private final Set<MediaType> supportedTypes; + public GeneralTransformer() { + this(new ByteDeleter(), new ByteFlipper(), + new ByteInjector(), new Truncator(), new SpanSwapper()); + } + + public GeneralTransformer(Transformer ... transformers) { + this(transformers.length, transformers); + } + + public GeneralTransformer(int maxTransforms, Transformer ... transformers) { + this.maxTransforms = (maxTransforms < 0) ? transformers.length : maxTransforms; + this.transformers = transformers; + Set<MediaType> tmpTypes = new HashSet<>(); + for (Transformer transformer : transformers) { + tmpTypes.addAll(transformer.getSupportedTypes()); + } + supportedTypes = Collections.unmodifiableSet(tmpTypes); + } + + @Override + public Set<MediaType> getSupportedTypes() { + return supportedTypes; + } + + @Override + public void transform(InputStream is, OutputStream os) throws IOException, TikaException { + //used for debugging + if (maxTransforms == 0) { + return; + } + int transformerCount = (maxTransforms == 1) ? 
1 : 1 + random.nextInt(maxTransforms); + int[] transformerIndices = new int[transformerCount]; + for (int i = 0; i < transformerCount; i++) { + transformerIndices[i] = random.nextInt(transformerCount); + } + //TODO -- make this actually streaming + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + IOUtils.copy(is, bos); + for (int i = 0; i < transformerIndices.length-1; i++) { + byte[] bytes = bos.toByteArray(); + bos = new ByteArrayOutputStream(); + transformers[transformerIndices[i]].transform( + new ByteArrayInputStream(bytes), bos); + bos.flush(); + if (bos.toByteArray().length == 0) { + LOG.warn("zero length: "+transformers[transformerIndices[i]]); + } + } + os.write(bos.toByteArray()); + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java new file mode 100644 index 0000000..e2bc16c --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fuzzing.general; + +import org.apache.tika.fuzzing.Transformer; +import org.apache.tika.io.IOUtils; +import org.apache.tika.mime.MediaType; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Collections; +import java.util.Random; +import java.util.Set; + +/** + * randomly swaps spans from the input + * + */ +public class SpanSwapper implements Transformer { + + Random random = new Random(); + private float swapProbability = 0.01f; + int maxSpanLength = 10000; + + static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); + + @Override + public Set<MediaType> getSupportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void transform(InputStream is, OutputStream os) throws IOException { + byte[] input = IOUtils.toByteArray(is); + int numSwaps = (int) Math.floor(swapProbability*input.length); + //at least one swap + numSwaps = numSwaps == 0 ? 1 : numSwaps; + byte[] ret = new byte[input.length]; + System.arraycopy(input, 0, ret, 0, input.length); + for (int i = 0; i < numSwaps; i++) { + ret = swap(ret); + } + os.write(ret); + } + + private byte[] swap(byte[] ret) { + if (ret.length == 0) { + return new byte[0]; + } + int srcStart = random.nextInt(ret.length); + int targStart = random.nextInt(ret.length); + //these spans can overlap; + + int len = random.nextInt(maxSpanLength); + int maxStart = Math.max(srcStart, targStart); + len = (len+maxStart < ret.length) ? 
len : + ret.length-maxStart; + + byte[] landingBytes = new byte[len]; + //copy the landing zone + System.arraycopy(ret, targStart, landingBytes, 0, len); + //now copy the src onto the targ + System.arraycopy(ret, srcStart, ret, targStart, len); + //now copy the targ over to the src + System.arraycopy(landingBytes, 0, ret, srcStart, len); + return ret; + } + +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java new file mode 100644 index 0000000..209810c --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.fuzzing.general; + +import org.apache.tika.fuzzing.Transformer; +import org.apache.tika.io.IOUtils; +import org.apache.tika.mime.MediaType; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Collections; +import java.util.Random; +import java.util.Set; + +public class Truncator implements Transformer { + + Random random = new Random(); + static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); + + @Override + public Set<MediaType> getSupportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void transform(InputStream is, OutputStream os) throws IOException { + //TODO -- redo streaming + byte[] input = IOUtils.toByteArray(is); + if (input.length == 0) { + return; + } + int len = 1 + random.nextInt(input.length); + //at least one + if (len >= input.length) { + len = input.length-2; + if (len < 0) { + len = 0; + } + } + + byte[] ret = new byte[len]; + System.arraycopy(input, 0, ret, 0, len); + os.write(ret); + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java new file mode 100644 index 0000000..0484c93 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java @@ -0,0 +1,1283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.fuzzing.pdf; + +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSBoolean; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSDocument; +import org.apache.pdfbox.cos.COSFloat; +import org.apache.pdfbox.cos.COSInteger; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSNull; +import org.apache.pdfbox.cos.COSNumber; +import org.apache.pdfbox.cos.COSObject; +import org.apache.pdfbox.cos.COSObjectKey; +import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.cos.COSUpdateInfo; +import org.apache.pdfbox.cos.ICOSVisitor; +import org.apache.pdfbox.io.IOUtils; +import org.apache.pdfbox.io.RandomAccessInputStream; +import org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.pdfparser.PDFXRefStream; +import org.apache.pdfbox.pdfwriter.COSStandardOutputStream; +import org.apache.pdfbox.pdfwriter.COSWriter; +import org.apache.pdfbox.pdfwriter.COSWriterXRefEntry; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.encryption.SecurityHandler; +import org.apache.pdfbox.pdmodel.fdf.FDFDocument; +import org.apache.pdfbox.pdmodel.interactive.digitalsignature.COSFilterInputStream; +import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface; +import org.apache.pdfbox.util.Hex; + +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import 
java.io.OutputStream; +import java.io.SequenceInputStream; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.text.DecimalFormat; +import java.text.DecimalFormatSymbols; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +public class EvilCOSWriter implements ICOSVisitor, Closeable { + /** + * The dictionary open token. + */ + public static final byte[] DICT_OPEN = "<<".getBytes(StandardCharsets.US_ASCII); + /** + * The dictionary close token. + */ + public static final byte[] DICT_CLOSE = ">>".getBytes(StandardCharsets.US_ASCII); + /** + * space character. + */ + public static final byte[] SPACE = {' '}; + /** + * The start to a PDF comment. + */ + public static final byte[] COMMENT = {'%'}; + + /** + * The output version of the PDF. + */ + public static final byte[] VERSION = "PDF-1.4".getBytes(StandardCharsets.US_ASCII); + /** + * Garbage bytes used to create the PDF header. + */ + public static final byte[] GARBAGE = new byte[]{(byte) 0xf6, (byte) 0xe4, (byte) 0xfc, (byte) 0xdf}; + /** + * The EOF constant. + */ + public static final byte[] EOF = "%%EOF".getBytes(StandardCharsets.US_ASCII); + // pdf tokens + + /** + * The reference token. + */ + public static final byte[] REFERENCE = "R".getBytes(StandardCharsets.US_ASCII); + /** + * The XREF token. + */ + public static final byte[] XREF = "xref".getBytes(StandardCharsets.US_ASCII); + /** + * The xref free token. + */ + public static final byte[] XREF_FREE = "f".getBytes(StandardCharsets.US_ASCII); + /** + * The xref used token. 
+ */ + public static final byte[] XREF_USED = "n".getBytes(StandardCharsets.US_ASCII); + /** + * The trailer token. + */ + public static final byte[] TRAILER = "trailer".getBytes(StandardCharsets.US_ASCII); + /** + * The start xref token. + */ + public static final byte[] STARTXREF = "startxref".getBytes(StandardCharsets.US_ASCII); + /** + * The starting object token. + */ + public static final byte[] OBJ = "obj".getBytes(StandardCharsets.US_ASCII); + /** + * The end object token. + */ + public static final byte[] ENDOBJ = "endobj".getBytes(StandardCharsets.US_ASCII); + /** + * The array open token. + */ + public static final byte[] ARRAY_OPEN = "[".getBytes(StandardCharsets.US_ASCII); + /** + * The array close token. + */ + public static final byte[] ARRAY_CLOSE = "]".getBytes(StandardCharsets.US_ASCII); + /** + * The open stream token. + */ + public static final byte[] STREAM = "stream".getBytes(StandardCharsets.US_ASCII); + /** + * The close stream token. + */ + public static final byte[] ENDSTREAM = "endstream".getBytes(StandardCharsets.US_ASCII); + + private final NumberFormat formatXrefOffset = new DecimalFormat("0000000000", + DecimalFormatSymbols.getInstance(Locale.US)); + + // the decimal format for the xref object generation number data + private final NumberFormat formatXrefGeneration = new DecimalFormat("00000", + DecimalFormatSymbols.getInstance(Locale.US)); + + // the stream where we create the pdf output + private OutputStream output; + + // the stream used to write standard cos data + private COSStandardOutputStream standardOutput; + + // the start position of the x ref section + private long startxref = 0; + + // the current object number + private long number = 0; + + // maps the object to the keys generated in the writer + // these are used for indirect references in other objects + //A hashtable is used on purpose over a hashmap + //so that null entries will not get added. 
+ @SuppressWarnings({"squid:S1149"}) + private final Map<COSBase, COSObjectKey> objectKeys = new Hashtable<>(); + + private final Map<COSObjectKey, COSBase> keyObject = new HashMap<>(); + + // the list of x ref entries to be made so far + private final List<COSWriterXRefEntry> xRefEntries = new ArrayList<>(); + private final Set<COSBase> objectsToWriteSet = new HashSet<>(); + + //A list of objects to write. + private final Deque<COSBase> objectsToWrite = new LinkedList<>(); + + //a list of objects already written + private final Set<COSBase> writtenObjects = new HashSet<>(); + + //An 'actual' is any COSBase that is not a COSObject. + //need to keep a list of the actuals that are added + //as well as the objects because there is a problem + //when adding a COSObject and then later adding + //the actual for that object, so we will track + //actuals separately. + private final Set<COSBase> actualsAdded = new HashSet<>(); + + private COSObjectKey currentObjectKey = null; + private PDDocument pdDocument = null; + private FDFDocument fdfDocument = null; + private boolean willEncrypt = false; + + // signing + private boolean incrementalUpdate = false; + private boolean reachedSignature = false; + private long signatureOffset; + private long signatureLength; + private long byteRangeOffset; + private long byteRangeLength; + private RandomAccessRead incrementalInput; + private OutputStream incrementalOutput; + private SignatureInterface signatureInterface; + private byte[] incrementPart; + private COSArray byteRangeArray; + + private final PDFTransformerConfig config; + private final Random random = new Random(); + /** + * COSWriter constructor. + * + * @param outputStream The output stream to write the PDF. It will be closed when this object is + * closed. 
+ */ + public EvilCOSWriter(OutputStream outputStream, PDFTransformerConfig config) { + setOutput(outputStream); + setStandardOutput(new COSStandardOutputStream(output)); + this.config = config; + } + + private void prepareIncrement(PDDocument doc) throws IOException { + if (doc != null) { + COSDocument cosDoc = doc.getDocument(); + + Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable(); + Set<COSObjectKey> keySet = xrefTable.keySet(); + long highestNumber = doc.getDocument().getHighestXRefObjectNumber(); + for (COSObjectKey cosObjectKey : keySet) { + COSBase object = cosDoc.getObjectFromPool(cosObjectKey).getObject(); + if (object != null && cosObjectKey != null && !(object instanceof COSNumber)) { + objectKeys.put(object, cosObjectKey); + keyObject.put(cosObjectKey, object); + } + + if (cosObjectKey != null) { + long num = cosObjectKey.getNumber(); + if (num > highestNumber) { + highestNumber = num; + } + } + } + setNumber(highestNumber); + } + } + + /** + * add an entry in the x ref table for later dump. + * + * @param entry The new entry to add. + */ + protected void addXRefEntry(COSWriterXRefEntry entry) { + getXRefEntries().add(entry); + } + + /** + * This will close the stream. + * + * @throws IOException If the underlying stream throws an exception. + */ + @Override + public void close() throws IOException { + if (getStandardOutput() != null) { + getStandardOutput().close(); + } + if (incrementalOutput != null) { + incrementalOutput.close(); + } + } + + /** + * This will get the current object number. + * + * @return The current object number. + */ + protected long getNumber() { + return number; + } + + /** + * This will get all available object keys. + * + * @return A map of all object keys. + */ + public Map<COSBase, COSObjectKey> getObjectKeys() { + return objectKeys; + } + + /** + * This will get the output stream. + * + * @return The output stream. 
+ */ + protected java.io.OutputStream getOutput() { + return output; + } + + /** + * This will get the standard output stream. + * + * @return The standard output stream. + */ + protected COSStandardOutputStream getStandardOutput() { + return standardOutput; + } + + /** + * This will get the current start xref. + * + * @return The current start xref. + */ + protected long getStartxref() { + return startxref; + } + + /** + * This will get the xref entries. + * + * @return All available xref entries. + */ + protected List<COSWriterXRefEntry> getXRefEntries() { + return xRefEntries; + } + + /** + * This will set the current object number. + * + * @param newNumber The new object number. + */ + protected void setNumber(long newNumber) { + number = newNumber; + + } + + /** + * This will set the output stream. + * + * @param newOutput The new output stream. + */ + private void setOutput(OutputStream newOutput) { + output = newOutput; + } + + /** + * This will set the standard output stream. + * + * @param newStandardOutput The new standard output stream. + */ + private void setStandardOutput(COSStandardOutputStream newStandardOutput) { + standardOutput = newStandardOutput; + } + + /** + * This will set the start xref. + * + * @param newStartxref The new start xref attribute. + */ + protected void setStartxref(long newStartxref) { + startxref = newStartxref; + } + + /** + * This will write the body of the document. + * + * @param doc The document to write the body for. + * @throws IOException If there is an error writing the data. 
+ */ + protected void doWriteBody(COSDocument doc) throws IOException { + COSDictionary trailer = doc.getTrailer(); + COSDictionary root = trailer.getCOSDictionary(COSName.ROOT); + COSDictionary info = trailer.getCOSDictionary(COSName.INFO); + COSDictionary encrypt = trailer.getCOSDictionary(COSName.ENCRYPT); + if (root != null) { + addObjectToWrite(root); + } + if (info != null) { + addObjectToWrite(info); + } + + doWriteObjects(); + willEncrypt = false; + if (encrypt != null) { + addObjectToWrite(encrypt); + } + + doWriteObjects(); + } + + private void doWriteObjects() throws IOException { + while (objectsToWrite.size() > 0) { + COSBase nextObject = objectsToWrite.removeFirst(); + objectsToWriteSet.remove(nextObject); + doWriteObject(nextObject); + } + } + + private void addObjectToWrite(COSBase object) { + COSBase actual = object; + if (actual instanceof COSObject) { + actual = ((COSObject) actual).getObject(); + } + + if (!writtenObjects.contains(object) && + !objectsToWriteSet.contains(object) && + !actualsAdded.contains(actual)) { + COSBase cosBase = null; + COSObjectKey cosObjectKey = null; + if (actual != null) { + cosObjectKey = objectKeys.get(actual); + } + if (cosObjectKey != null) { + cosBase = keyObject.get(cosObjectKey); + } + if (actual != null && objectKeys.containsKey(actual) + && object instanceof COSUpdateInfo && !((COSUpdateInfo) object).isNeedToBeUpdated() + && cosBase instanceof COSUpdateInfo && !((COSUpdateInfo) cosBase).isNeedToBeUpdated()) { + return; + } + objectsToWrite.add(object); + objectsToWriteSet.add(object); + if (actual != null) { + actualsAdded.add(actual); + } + } + } + + /** + * This will write a COS object. + * + * @param obj The object to write. 
+ * @throws IOException if the output cannot be written + */ + public void doWriteObject(COSBase obj) throws IOException { + writtenObjects.add(obj); + // find the physical reference + currentObjectKey = getObjectKey(obj); + // add a x ref entry + addXRefEntry(new COSWriterXRefEntry(getStandardOutput().getPos(), obj, currentObjectKey)); + // write the object + + long objectNumber = currentObjectKey.getNumber(); + if (config.getRandomizeObjectNumbers()) { + if (random.nextFloat() < 0.99) { + long orig = objectNumber; + objectNumber = 1;//random.nextInt(((int)objectNumber)*2); + } + } + getStandardOutput().write(String.valueOf(objectNumber).getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().write(SPACE); + getStandardOutput().write(String.valueOf(currentObjectKey.getGeneration()).getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().write(SPACE); + getStandardOutput().write(OBJ); + getStandardOutput().writeEOL(); + // null test added to please Sonar + // TODO: shouldn't all public methods be guarded against passing null. Passing null to most methods will + // fail with an NPE + mutate(obj); + if (obj != null) { + obj.accept(this); + } + getStandardOutput().writeEOL(); + getStandardOutput().write(ENDOBJ); + getStandardOutput().writeEOL(); + } + + private void mutate(COSBase obj) { + //stub + if (obj instanceof COSStream) { + COSStream stream = (COSStream)obj; + //manipulate filters and stream length + } + } + + /** + * This will write the header to the PDF document. + * + * @param doc The document to get the data from. + * @throws IOException If there is an error writing to the stream. 
+ */ + protected void doWriteHeader(COSDocument doc) throws IOException { + String headerString; + if (fdfDocument != null) { + headerString = "%FDF-" + Float.toString(doc.getVersion()); + } else { + headerString = "%PDF-" + Float.toString(doc.getVersion()); + } + getStandardOutput().write(headerString.getBytes(StandardCharsets.ISO_8859_1)); + + getStandardOutput().writeEOL(); + getStandardOutput().write(COMMENT); + getStandardOutput().write(GARBAGE); + getStandardOutput().writeEOL(); + } + + + /** + * This will write the trailer to the PDF document. + * + * @param doc The document to create the trailer for. + * @throws IOException If there is an IOError while writing the document. + */ + protected void doWriteTrailer(COSDocument doc) throws IOException { + getStandardOutput().write(TRAILER); + getStandardOutput().writeEOL(); + + COSDictionary trailer = doc.getTrailer(); + //sort xref, needed only if object keys not regenerated + Collections.sort(getXRefEntries()); + COSWriterXRefEntry lastEntry = getXRefEntries().get(getXRefEntries().size() - 1); + trailer.setLong(COSName.SIZE, lastEntry.getKey().getNumber() + 1); + // Only need to stay, if an incremental update will be performed + if (!incrementalUpdate) { + trailer.removeItem(COSName.PREV); + } + if (!doc.isXRefStream()) { + trailer.removeItem(COSName.XREF_STM); + } + // Remove a checksum if present + trailer.removeItem(COSName.DOC_CHECKSUM); + + COSArray idArray = trailer.getCOSArray(COSName.ID); + if (idArray != null) { + idArray.setDirect(true); + } + + trailer.accept(this); + } + + private void doWriteXRefInc(COSDocument doc, long hybridPrev) throws IOException { + if (doc.isXRefStream() || hybridPrev != -1) { + // the file uses XrefStreams, so we need to update + // it with an xref stream. We create a new one and fill it + // with data available here + + // create a new XRefStrema object + PDFXRefStream pdfxRefStream = new PDFXRefStream(doc); + + // add all entries from the incremental update. 
+ List<COSWriterXRefEntry> xRefEntries2 = getXRefEntries(); + for (COSWriterXRefEntry cosWriterXRefEntry : xRefEntries2) { + pdfxRefStream.addEntry(cosWriterXRefEntry); + } + + COSDictionary trailer = doc.getTrailer(); + if (incrementalUpdate) { + // use previous startXref value as new PREV value + trailer.setLong(COSName.PREV, doc.getStartXref()); + } else { + trailer.removeItem(COSName.PREV); + } + pdfxRefStream.addTrailerInfo(trailer); + // the size is the highest object number+1. we add one more + // for the xref stream object we are going to write + pdfxRefStream.setSize(getNumber() + 2); + + setStartxref(getStandardOutput().getPos()); + COSStream stream2 = pdfxRefStream.getStream(); + doWriteObject(stream2); + } + + if (!doc.isXRefStream() || hybridPrev != -1) { + COSDictionary trailer = doc.getTrailer(); + trailer.setLong(COSName.PREV, doc.getStartXref()); + if (hybridPrev != -1) { + COSName xrefStm = COSName.XREF_STM; + trailer.removeItem(xrefStm); + trailer.setLong(xrefStm, getStartxref()); + } + doWriteXRefTable(); + doWriteTrailer(doc); + } + } + + // writes the "xref" table + private void doWriteXRefTable() throws IOException { + addXRefEntry(COSWriterXRefEntry.getNullEntry()); + + // sort xref, needed only if object keys not regenerated + Collections.sort(getXRefEntries()); + + // remember the position where x ref was written + setStartxref(getStandardOutput().getPos()); + + getStandardOutput().write(XREF); + getStandardOutput().writeEOL(); + // write start object number and object count for this x ref section + // we assume starting from scratch + + Long[] xRefRanges = getXRefRanges(getXRefEntries()); + int xRefLength = xRefRanges.length; + int x = 0; + int j = 0; + while (x < xRefLength && (xRefLength % 2) == 0) { + writeXrefRange(xRefRanges[x], xRefRanges[x + 1]); + + for (int i = 0; i < xRefRanges[x + 1]; ++i) { + writeXrefEntry(xRefEntries.get(j++)); + } + x += 2; + } + } + + /** + * Write an incremental update for a non signature case. 
This can be used for e.g. augmenting + * signatures. + * + * @throws IOException + */ + private void doWriteIncrement() throws IOException { + // write existing PDF + IOUtils.copy(new RandomAccessInputStream(incrementalInput), incrementalOutput); + // write the actual incremental update + incrementalOutput.write(((ByteArrayOutputStream) output).toByteArray()); + } + + private void doWriteSignature() throws IOException { + // calculate the ByteRange values + long inLength = incrementalInput.length(); + long beforeLength = signatureOffset; + long afterOffset = signatureOffset + signatureLength; + long afterLength = getStandardOutput().getPos() - (inLength + signatureLength) - (signatureOffset - inLength); + + String byteRange = "0 " + beforeLength + " " + afterOffset + " " + afterLength + "]"; + + // Assign the values to the actual COSArray, so that the user can access it before closing + byteRangeArray.set(0, COSInteger.ZERO); + byteRangeArray.set(1, COSInteger.get(beforeLength)); + byteRangeArray.set(2, COSInteger.get(afterOffset)); + byteRangeArray.set(3, COSInteger.get(afterLength)); + + if (byteRange.length() > byteRangeLength) { + throw new IOException("Can't write new byteRange '" + byteRange + + "' not enough space: byteRange.length(): " + byteRange.length() + + ", byteRangeLength: " + byteRangeLength); + } + + // copy the new incremental data into a buffer (e.g. 
signature dict, trailer) + ByteArrayOutputStream byteOut = (ByteArrayOutputStream) output; + byteOut.flush(); + incrementPart = byteOut.toByteArray(); + + // overwrite the ByteRange in the buffer + byte[] byteRangeBytes = byteRange.getBytes(StandardCharsets.ISO_8859_1); + for (int i = 0; i < byteRangeLength; i++) { + if (i >= byteRangeBytes.length) { + incrementPart[(int) (byteRangeOffset + i - inLength)] = 0x20; // SPACE + } else { + incrementPart[(int) (byteRangeOffset + i - inLength)] = byteRangeBytes[i]; + } + } + + if (signatureInterface != null) { + // data to be signed + final InputStream dataToSign = getDataToSign(); + + // sign the bytes + byte[] signatureBytes = signatureInterface.sign(dataToSign); + writeExternalSignature(signatureBytes); + } + // else signature should created externally and set via writeSignature() + } + + /** + * Return the stream of PDF data to be signed. Clients should use this method only to create + * signatures externally. {@link #write(PDDocument)} method should have been called prior. The + * created signature should be set using {@link #writeExternalSignature(byte[])}. + * <p> + * When {@link SignatureInterface} instance is used, COSWriter obtains and writes the signature + * itself. 
+ * </p> + * + * @return data stream to be signed + * @throws IllegalStateException if PDF is not prepared for external signing + * @throws IOException if input data is closed + */ + public InputStream getDataToSign() throws IOException { + if (incrementPart == null || incrementalInput == null) { + throw new IllegalStateException("PDF not prepared for signing"); + } + // range of incremental bytes to be signed (includes /ByteRange but not /Contents) + int incPartSigOffset = (int) (signatureOffset - incrementalInput.length()); + int afterSigOffset = incPartSigOffset + (int) signatureLength; + int[] range = + { + 0, incPartSigOffset, + afterSigOffset, incrementPart.length - afterSigOffset + }; + + return new SequenceInputStream( + new RandomAccessInputStream(incrementalInput), + new COSFilterInputStream(incrementPart, range)); + } + + /** + * Write externally created signature of PDF data obtained via {@link #getDataToSign()} method. + * + * @param cmsSignature CMS signature byte array + * @throws IllegalStateException if PDF is not prepared for external signing + * @throws IOException if source data stream is closed + */ + public void writeExternalSignature(byte[] cmsSignature) throws IOException { + + if (incrementPart == null || incrementalInput == null) { + throw new IllegalStateException("PDF not prepared for setting signature"); + } + byte[] signatureBytes = Hex.getBytes(cmsSignature); + + // subtract 2 bytes because of the enclosing "<>" + if (signatureBytes.length > signatureLength - 2) { + throw new IOException("Can't write signature, not enough space"); + } + + // overwrite the signature Contents in the buffer + int incPartSigOffset = (int) (signatureOffset - incrementalInput.length()); + System.arraycopy(signatureBytes, 0, incrementPart, incPartSigOffset + 1, signatureBytes.length); + + // write the data to the incremental output stream + IOUtils.copy(new RandomAccessInputStream(incrementalInput), incrementalOutput); + 
incrementalOutput.write(incrementPart); + + // prevent further use + incrementPart = null; + } + + private void writeXrefRange(long x, long y) throws IOException { + getStandardOutput().write(String.valueOf(x).getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().write(SPACE); + getStandardOutput().write(String.valueOf(y).getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().writeEOL(); + } + + private void writeXrefEntry(COSWriterXRefEntry entry) throws IOException { + String offset = formatXrefOffset.format(entry.getOffset()); + String generation = formatXrefGeneration.format(entry.getKey().getGeneration()); + getStandardOutput().write(offset.getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().write(SPACE); + getStandardOutput().write(generation.getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().write(SPACE); + getStandardOutput().write(entry.isFree() ? XREF_FREE : XREF_USED); + getStandardOutput().writeCRLF(); + } + + /** + * check the xref entries and write out the ranges. The format of the + * returned array is exactly the same as the pdf specification. See section + * 7.5.4 of ISO32000-1:2008, example 1 (page 40) for reference. + * <p> + * example: 0 1 2 5 6 7 8 10 + * <p> + * will create a array with follow ranges + * <p> + * 0 3 5 4 10 1 + * <p> + * this mean that the element 0 is followed by two other related numbers + * that represent a cluster of the size 3. 5 is follow by three other + * related numbers and create a cluster of size 4. etc. 
+ * + * @param xRefEntriesList list with the xRef entries that was written + * @return a integer array with the ranges + */ + protected Long[] getXRefRanges(List<COSWriterXRefEntry> xRefEntriesList) { + long last = -2; + long count = 1; + + List<Long> list = new ArrayList<>(); + for (Object object : xRefEntriesList) { + long nr = (int) ((COSWriterXRefEntry) object).getKey().getNumber(); + if (nr == last + 1) { + ++count; + last = nr; + } else if (last == -2) { + last = nr; + } else { + list.add(last - count + 1); + list.add(count); + last = nr; + count = 1; + } + } + // If no new entry is found, we need to write out the last result + if (xRefEntriesList.size() > 0) { + list.add(last - count + 1); + list.add(count); + } + return list.toArray(new Long[list.size()]); + } + + /** + * This will get the object key for the object. + * + * @param obj The object to get the key for. + * @return The object key for the object. + */ + private COSObjectKey getObjectKey(COSBase obj) { + COSBase actual = obj; + if (actual instanceof COSObject) { + actual = ((COSObject) obj).getObject(); + } + // PDFBOX-4540: because objectKeys is accessible from outside, it is possible + // that a COSObject obj is already in the objectKeys map. 
+ COSObjectKey key = objectKeys.get(obj); + if (key == null && actual != null) { + key = objectKeys.get(actual); + } + if (key == null) { + setNumber(getNumber() + 1); + key = new COSObjectKey(getNumber(), 0); + objectKeys.put(obj, key); + if (actual != null) { + objectKeys.put(actual, key); + } + } + return key; + } + + @Override + public Object visitFromArray(COSArray obj) throws IOException { + int count = 0; + getStandardOutput().write(ARRAY_OPEN); + for (Iterator<COSBase> i = obj.iterator(); i.hasNext(); ) { + COSBase current = i.next(); + if (current instanceof COSDictionary) { + if (current.isDirect()) { + visitFromDictionary((COSDictionary) current); + } else { + addObjectToWrite(current); + writeReference(current); + } + } else if (current instanceof COSObject) { + COSBase subValue = ((COSObject) current).getObject(); + if (willEncrypt || incrementalUpdate || subValue instanceof COSDictionary || subValue == null) { + // PDFBOX-4308: added willEncrypt to prevent an object + // that is referenced several times from being written + // direct and indirect, thus getting encrypted + // with wrong object number or getting encrypted twice + addObjectToWrite(current); + writeReference(current); + } else { + subValue.accept(this); + } + } else if (current == null) { + COSNull.NULL.accept(this); + } else { + current.accept(this); + } + count++; + if (i.hasNext()) { + if (count % 10 == 0) { + getStandardOutput().writeEOL(); + } else { + getStandardOutput().write(SPACE); + } + } + } + getStandardOutput().write(ARRAY_CLOSE); + getStandardOutput().writeEOL(); + return null; + } + + @Override + public Object visitFromBoolean(COSBoolean obj) throws IOException { + obj.writePDF(getStandardOutput()); + return null; + } + + @Override + public Object visitFromDictionary(COSDictionary obj) throws IOException { + if (!reachedSignature) { + COSBase itemType = obj.getItem(COSName.TYPE); + if (COSName.SIG.equals(itemType) || COSName.DOC_TIME_STAMP.equals(itemType)) { + 
reachedSignature = true; + } + } + getStandardOutput().write(DICT_OPEN); + getStandardOutput().writeEOL(); + for (Map.Entry<COSName, COSBase> entry : obj.entrySet()) { + COSBase value = entry.getValue(); + if (value != null) { + entry.getKey().accept(this); + getStandardOutput().write(SPACE); + if (value instanceof COSDictionary) { + COSDictionary dict = (COSDictionary) value; + + if (!incrementalUpdate) { + // write all XObjects as direct objects, this will save some size + // PDFBOX-3684: but avoid dictionary that references itself + COSBase item = dict.getItem(COSName.XOBJECT); + if (item != null && !COSName.XOBJECT.equals(entry.getKey())) { + item.setDirect(true); + } + item = dict.getItem(COSName.RESOURCES); + if (item != null && !COSName.RESOURCES.equals(entry.getKey())) { + item.setDirect(true); + } + } + + if (dict.isDirect()) { + // If the object should be written direct, we need + // to pass the dictionary to the visitor again. + visitFromDictionary(dict); + } else { + addObjectToWrite(dict); + writeReference(dict); + } + } else if (value instanceof COSObject) { + COSBase subValue = ((COSObject) value).getObject(); + if (willEncrypt || incrementalUpdate || subValue instanceof COSDictionary || subValue == null) { + // PDFBOX-4308: added willEncrypt to prevent an object + // that is referenced several times from being written + // direct and indirect, thus getting encrypted + // with wrong object number or getting encrypted twice + addObjectToWrite(value); + writeReference(value); + } else { + subValue.accept(this); + } + } else { + // If we reach the pdf signature, we need to determinate the position of the + // content and byterange + if (reachedSignature && COSName.CONTENTS.equals(entry.getKey())) { + signatureOffset = getStandardOutput().getPos(); + value.accept(this); + signatureLength = getStandardOutput().getPos() - signatureOffset; + } else if (reachedSignature && COSName.BYTERANGE.equals(entry.getKey())) { + byteRangeArray = (COSArray) 
entry.getValue(); + byteRangeOffset = getStandardOutput().getPos() + 1; + value.accept(this); + byteRangeLength = getStandardOutput().getPos() - 1 - byteRangeOffset; + reachedSignature = false; + } else { + value.accept(this); + } + } + getStandardOutput().writeEOL(); + + } else { + //then we won't write anything, there are a couple cases + //were the value of an entry in the COSDictionary will + //be a dangling reference that points to nothing + //so we will just not write out the entry if that is the case + } + } + getStandardOutput().write(DICT_CLOSE); + getStandardOutput().writeEOL(); + return null; + } + + @Override + public Object visitFromDocument(COSDocument doc) throws IOException { + if (!incrementalUpdate) { + doWriteHeader(doc); + } else { + // Sometimes the original file will be missing a newline at the end + // In order to avoid having %%EOF the first object on the same line + // as the %%EOF, we put a newline here. If there's already one at + // the end of the file, an extra one won't hurt. 
PDFBOX-1051 + getStandardOutput().writeCRLF(); + } + + doWriteBody(doc); + + // get the previous trailer + COSDictionary trailer = doc.getTrailer(); + long hybridPrev = -1; + + if (trailer != null) { + hybridPrev = trailer.getLong(COSName.XREF_STM); + } + + if (incrementalUpdate || doc.isXRefStream()) { + doWriteXRefInc(doc, hybridPrev); + } else { + doWriteXRefTable(); + doWriteTrailer(doc); + } + + // write endof + getStandardOutput().write(STARTXREF); + getStandardOutput().writeEOL(); + getStandardOutput().write(String.valueOf(getStartxref()).getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().writeEOL(); + getStandardOutput().write(EOF); + getStandardOutput().writeEOL(); + + if (incrementalUpdate) { + if (signatureOffset == 0 || byteRangeOffset == 0) { + doWriteIncrement(); + } else { + doWriteSignature(); + } + } + + return null; + } + + @Override + public Object visitFromFloat(COSFloat obj) throws IOException { + obj.writePDF(getStandardOutput()); + return null; + } + + @Override + public Object visitFromInt(COSInteger obj) throws IOException { + obj.writePDF(getStandardOutput()); + return null; + } + + @Override + public Object visitFromName(COSName obj) throws IOException { + obj.writePDF(getStandardOutput()); + return null; + } + + @Override + public Object visitFromNull(COSNull obj) throws IOException { + obj.writePDF(getStandardOutput()); + return null; + } + + /** + * visitFromObjRef method comment. + * + * @param obj The object that is being visited. + * @throws IOException If there is an exception while visiting this object. 
+ */ + public void writeReference(COSBase obj) throws IOException { + COSObjectKey key = getObjectKey(obj); + getStandardOutput().write(String.valueOf(key.getNumber()).getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().write(SPACE); + getStandardOutput().write(String.valueOf(key.getGeneration()).getBytes(StandardCharsets.ISO_8859_1)); + getStandardOutput().write(SPACE); + getStandardOutput().write(REFERENCE); + } + + @Override + public Object visitFromStream(COSStream obj) throws IOException { + if (willEncrypt) { + pdDocument.getEncryption().getSecurityHandler() + .encryptStream(obj, currentObjectKey.getNumber(), currentObjectKey.getGeneration()); + } + + InputStream input = null; + try { + // write the stream content + visitFromDictionary(obj); + getStandardOutput().write(STREAM); + getStandardOutput().writeCRLF(); + + input = obj.createRawInputStream(); + IOUtils.copy(input, getStandardOutput()); + + getStandardOutput().writeCRLF(); + getStandardOutput().write(ENDSTREAM); + getStandardOutput().writeEOL(); + return null; + } finally { + if (input != null) { + input.close(); + } + } + } + + @Override + public Object visitFromString(COSString obj) throws IOException { + if (willEncrypt) { + pdDocument.getEncryption().getSecurityHandler().encryptString( + obj, + currentObjectKey.getNumber(), + currentObjectKey.getGeneration()); + } + COSWriter.writeString(obj, getStandardOutput()); + return null; + } + + /** + * This will write the pdf document. + * + * @param doc The document to write. + * @throws IOException If an error occurs while generating the data. + */ + public void write(COSDocument doc) throws IOException { + PDDocument pdDoc = new PDDocument(doc); + write(pdDoc); + } + + /** + * This will write the pdf document. If signature should be created externally, + * {@link #writeExternalSignature(byte[])} should be invoked to set signature after calling this method. + * + * @param doc The document to write. 
+ * @throws IOException If an error occurs while generating the data. + */ + public void write(PDDocument doc) throws IOException { + write(doc, null); + } + + /** + * This will write the pdf document. If signature should be created externally, + * {@link #writeExternalSignature(byte[])} should be invoked to set signature after calling this method. + * + * @param doc The document to write. + * @param signInterface class to be used for signing; {@code null} if external signing would be performed + * or there will be no signing at all + * @throws IOException If an error occurs while generating the data. + * @throws IllegalStateException If the document has an encryption dictionary but no protection + * policy. + */ + public void write(PDDocument doc, SignatureInterface signInterface) throws IOException { + Long idTime = doc.getDocumentId() == null ? System.currentTimeMillis() : + doc.getDocumentId(); + + pdDocument = doc; + signatureInterface = signInterface; + + if (incrementalUpdate) { + prepareIncrement(doc); + } + + // if the document says we should remove encryption, then we shouldn't encrypt + if (doc.isAllSecurityToBeRemoved()) { + willEncrypt = false; + // also need to get rid of the "Encrypt" in the trailer so readers + // don't try to decrypt a document which is not encrypted + COSDocument cosDoc = doc.getDocument(); + COSDictionary trailer = cosDoc.getTrailer(); + trailer.removeItem(COSName.ENCRYPT); + } else { + if (pdDocument.getEncryption() != null) { + if (!incrementalUpdate) { + SecurityHandler securityHandler = pdDocument.getEncryption().getSecurityHandler(); + if (!securityHandler.hasProtectionPolicy()) { + throw new IllegalStateException("PDF contains an encryption dictionary, please remove it with " + + "setAllSecurityToBeRemoved() or set a protection policy with protect()"); + } + securityHandler.prepareDocumentForEncryption(pdDocument); + } + willEncrypt = true; + } else { + willEncrypt = false; + } + } + + COSDocument cosDoc = 
pdDocument.getDocument(); + COSDictionary trailer = cosDoc.getTrailer(); + COSArray idArray; + boolean missingID = true; + COSBase base = trailer.getDictionaryObject(COSName.ID); + if (base instanceof COSArray) { + idArray = (COSArray) base; + if (idArray.size() == 2) { + missingID = false; + } + } else { + idArray = new COSArray(); + } + if (missingID || incrementalUpdate) { + MessageDigest md5; + try { + md5 = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + // should never happen + throw new RuntimeException(e); + } + + // algorithm says to use time/path/size/values in doc to generate the id. + // we don't have path or size, so do the best we can + md5.update(Long.toString(idTime).getBytes(StandardCharsets.ISO_8859_1)); + + COSDictionary info = trailer.getCOSDictionary(COSName.INFO); + if (info != null) { + for (COSBase cosBase : info.getValues()) { + md5.update(cosBase.toString().getBytes(StandardCharsets.ISO_8859_1)); + } + } + // reuse origin documentID if available as first value + COSString firstID = missingID ? new COSString(md5.digest()) : (COSString) idArray.get(0); + // it's ok to use the same ID for the second part if the ID is created for the first time + COSString secondID = missingID ? firstID : new COSString(md5.digest()); + idArray = new COSArray(); + idArray.add(firstID); + idArray.add(secondID); + trailer.setItem(COSName.ID, idArray); + } + cosDoc.accept(this); + } + + /** + * This will write the fdf document. + * + * @param doc The document to write. + * @throws IOException If an error occurs while generating the data. + */ + public void write(FDFDocument doc) throws IOException { + fdfDocument = doc; + willEncrypt = false; + COSDocument cosDoc = fdfDocument.getDocument(); + cosDoc.accept(this); + } + + /** + * This will output the given byte getString as a PDF object. + * + * @param string COSString to be written + * @param output The stream to write to. 
+ * @throws IOException If there is an error writing to the stream. + */ + public static void writeString(COSString string, OutputStream output) throws IOException { + writeString(string.getBytes(), string.getForceHexForm(), output); + } + + /** + * This will output the given text/byte getString as a PDF object. + * + * @param bytes byte array representation of a string to be written + * @param output The stream to write to. + * @throws IOException If there is an error writing to the stream. + */ + public static void writeString(byte[] bytes, OutputStream output) throws IOException { + writeString(bytes, false, output); + } + + /** + * This will output the given text/byte string as a PDF object. + * + * @param output The stream to write to. + * @throws IOException If there is an error writing to the stream. + */ + private static void writeString(byte[] bytes, boolean forceHex, OutputStream output) + throws IOException { + // check for non-ASCII characters + boolean isASCII = true; + if (!forceHex) { + for (byte b : bytes) { + // if the byte is negative then it is an eight bit byte and is outside the ASCII range + if (b < 0) { + isASCII = false; + break; + } + // PDFBOX-3107 EOL markers within a string are troublesome + if (b == 0x0d || b == 0x0a) { + isASCII = false; + break; + } + } + } + + if (isASCII && !forceHex) { + // write ASCII string + output.write('('); + for (byte b : bytes) { + switch (b) { + case '(': + case ')': + case '\\': + output.write('\\'); + output.write(b); + break; + default: + output.write(b); + break; + } + } + output.write(')'); + } else { + // write hex string + output.write('<'); + Hex.writeHexBytes(bytes, output); + output.write('>'); + } + } +} diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java new file mode 100644 index 0000000..ab7fa11 --- /dev/null +++ 
b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java @@ -0,0 +1,52 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.fuzzing.pdf;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fuzzing.Transformer;
import org.apache.tika.fuzzing.exceptions.CantFuzzException;
import org.apache.tika.mime.MediaType;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collections;
import java.util.Set;

/**
 * {@link Transformer} for {@code application/pdf}: loads the input with PDFBox and
 * writes it back out through {@link EvilCOSWriter}.
 */
public class PDFTransformer implements Transformer {
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("pdf"));

    /** Settings handed to every {@link EvilCOSWriter}; never reassigned. */
    private final PDFTransformerConfig config = new PDFTransformerConfig();

    @Override
    public Set<MediaType> getSupportedTypes() {
        return SUPPORTED_TYPES;
    }

    /**
     * Loads a PDF from {@code is} and writes the rewritten document to {@code os}.
     *
     * @param is stream holding the original PDF
     * @param os stream receiving the rewritten PDF
     * @throws CantFuzzException if PDFBox reports an invalid password while loading
     * @throws IOException if loading or writing fails
     * @throws TikaException declared by the {@link Transformer} contract
     */
    @Override
    public void transform(InputStream is, OutputStream os) throws IOException, TikaException {
        // Resources close in reverse order: the writer first, then the document.
        try (PDDocument pdDocument = PDDocument.load(is);
             EvilCOSWriter cosWriter = new EvilCOSWriter(os, config)) {
            cosWriter.write(pdDocument);
        } catch (InvalidPasswordException e) {
            // NOTE(review): the cause is dropped here; chain it if CantFuzzException
            // offers a (String, Throwable) constructor -- not visible from this file.
            throw new CantFuzzException("encrypted doc");
        }
    }
}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java new file mode 100644 index 0000000..d152878 --- /dev/null +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java @@ -0,0 +1,26 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.fuzzing.pdf;

/**
 * Settings for {@link PDFTransformer}.
 */
public class PDFTransformerConfig {

    /** Whether written object numbers may be replaced while writing; defaults to {@code true}. */
    private boolean randomizeObjectNumbers = true;

    /**
     * @return {@code true} if object numbers should be randomized
     */
    public boolean getRandomizeObjectNumbers() {
        return randomizeObjectNumbers;
    }
}
diff --git a/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer b/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer new file mode 100644 index 0000000..07390de --- /dev/null +++ b/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements.
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.tika.fuzzing.general.GeneralTransformer +#org.apache.tika.fuzzing.pdf.PDFTransformer \ No newline at end of file diff --git a/tika-fuzzing/src/main/resources/log4j.properties b/tika-fuzzing/src/main/resources/log4j.properties new file mode 100644 index 0000000..7d3b372 --- /dev/null +++ b/tika-fuzzing/src/main/resources/log4j.properties @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#info,debug, error,fatal ... 
+log4j.rootLogger=info,stderr + +#console +log4j.appender.stderr=org.apache.log4j.ConsoleAppender +log4j.appender.stderr.layout=org.apache.log4j.PatternLayout +log4j.appender.stderr.Target=System.err + +log4j.appender.stderr.layout.ConversionPattern= %-5p %m%n diff --git a/tika-fuzzing/src/test/java/TestFuzzingCLI.java b/tika-fuzzing/src/test/java/TestFuzzingCLI.java new file mode 100644 index 0000000..a98291b --- /dev/null +++ b/tika-fuzzing/src/test/java/TestFuzzingCLI.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import org.apache.commons.io.FileUtils; +import org.apache.tika.fuzzing.cli.FuzzingCLI; +import org.apache.tika.utils.ProcessUtils; +import org.junit.Ignore; +import org.junit.Test; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +public class TestFuzzingCLI { + + @Test + @Ignore + public void testBasic() throws Exception { + //convert to actual unit test + String inputDir = "";// fill in + String outputDir = "";//fill in + String[] args = new String[] { + "-i", inputDir, + "-o", outputDir, + "-n", "8", // num threads + "-t", "1", //max transformers + "-p", "100", //per file iterations + "-r", "3" + }; + FuzzingCLI.main(args); + } + + @Test + @Ignore + public void testMock() throws Exception { + //convert to actual unit test + Path inputDir = Paths.get(getClass().getResource("/test-documents").toURI()); + Path outputDir = Files.createTempDirectory("tika-fuzzing-"); + String[] args = new String[] { + "-i", ProcessUtils.escapeCommandLine(inputDir.toAbsolutePath().toString()), + "-o", ProcessUtils.escapeCommandLine(outputDir.toAbsolutePath().toString()), + "-n", "8", // num threads + "-t", "0", //max transformers + "-p", "10", //per file iterations + "-m", "10000", //max ms per file + "-r", "3" + }; + try { + FuzzingCLI.main(args); + } finally { + FileUtils.deleteDirectory(outputDir.toFile()); + } + } +} diff --git a/tika-fuzzing/src/test/java/TestTransformer.java b/tika-fuzzing/src/test/java/TestTransformer.java new file mode 100644 index 0000000..1db2e1e --- /dev/null +++ b/tika-fuzzing/src/test/java/TestTransformer.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.tika.fuzzing.general.GeneralTransformer; +import org.junit.Ignore; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; + +/** Manual, currently ignored check that feeds an in-memory byte array through {@code GeneralTransformer}. */ public class TestTransformer { + + /** Reads the bytes at a placeholder path (currently the empty path, to be replaced with a real file), transforms them 100 times into a fresh {@code ByteArrayOutputStream}, and prints "SAME" whenever the output is byte-identical to the input. Nothing is asserted. */ @Test + @Ignore + public void testBasic() throws Exception { + //turn into actual unit test + Path path = Paths.get("");//put something meaningful here + + GeneralTransformer transformer = new GeneralTransformer(); + byte[] bytes = Files.readAllBytes(path); + + for (int i = 0; i < 100; i++) { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + transformer.transform( + new ByteArrayInputStream(bytes), bos); + + if (Arrays.equals(bos.toByteArray(), bytes)) { + System.out.println("SAME"); + } + } + } +} diff --git a/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml b/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml new file mode 100644 index 0000000..f1f5b67 --- /dev/null +++ b/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<mock> + <metadata action="add" name="author">Nikolai Lobachevsky</metadata> + <write element="p">some content</write> + <hang millis="30000" heavy="true" pulse_millis="100" /> +</mock> \ No newline at end of file diff --git a/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml b/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml new file mode 100644 index 0000000..4561c3a --- /dev/null +++ b/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +<mock> + <metadata action="add" name="author">Nikolai Lobachevsky</metadata> + <write element="p">some content</write> + <throw class="java.lang.NullPointerException">another null pointer exception</throw> +</mock> \ No newline at end of file diff --git a/tika-fuzzing/src/test/resources/test-documents/system_exit.xml b/tika-fuzzing/src/test/resources/test-documents/system_exit.xml new file mode 100644 index 0000000..75d1d3b --- /dev/null +++ b/tika-fuzzing/src/test/resources/test-documents/system_exit.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<mock> + <metadata action="add" name="author">Nikolai Lobachevsky</metadata> + <write element="p">some content</write> + <system_exit /> +</mock> \ No newline at end of file
