This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2468e43b1 TIKA-3799 -- Refactor FuzzingCLI to use PipesParser
2468e43b1 is described below

commit 2468e43b1eed19409bfeb8749b02e0d0350d872b
Author: tallison <[email protected]>
AuthorDate: Wed Jun 22 16:14:50 2022 -0400

    TIKA-3799 -- Refactor FuzzingCLI to use PipesParser
---
 CHANGES.txt                                        |   2 +
 tika-fuzzing/pom.xml                               |  11 +-
 .../org/apache/tika/fuzzing/cli/FuzzingCLI.java    | 325 ++++++++++++---------
 .../apache/tika/fuzzing/cli/FuzzingCLIConfig.java  |  97 +++---
 .../tika/fuzzing/general/GeneralTransformer.java   |   1 +
 tika-fuzzing/src/main/resources/log4j2.xml         |   6 +
 .../test/resources/configs/tika-fuzzing-config.xml |  69 +++++
 tika-fuzzing/src/test/resources/log4j2.xml         |  14 +-
 8 files changed, 319 insertions(+), 206 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 76fa225c0..11f170b1b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.4.2 - ???
 
+   * Refactor FuzzingCLI to use PipesParser (TIKA-3799).
+
    * ServiceLoader's loadServiceProviders() now guarantees
      unique classes (TIKA-3797).
 
diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml
index 6cd5d0a0a..835d4e125 100644
--- a/tika-fuzzing/pom.xml
+++ b/tika-fuzzing/pom.xml
@@ -37,7 +37,16 @@
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-core</artifactId>
       <version>${project.version}</version>
-      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-serialization</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-emitter-fs</artifactId>
+      <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>commons-cli</groupId>
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
index 10453e6be..53cb22b40 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
@@ -17,12 +17,12 @@
 package org.apache.tika.fuzzing.cli;
 
 import java.io.IOException;
-import java.nio.file.FileVisitResult;
-import java.nio.file.FileVisitor;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.attribute.BasicFileAttributes;
+import java.util.Locale;
+import java.util.UUID;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
@@ -33,205 +33,250 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.FilenameUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fuzzing.Transformer;
+import org.apache.tika.fuzzing.general.ByteDeleter;
+import org.apache.tika.fuzzing.general.ByteFlipper;
+import org.apache.tika.fuzzing.general.ByteInjector;
+import org.apache.tika.fuzzing.general.GeneralTransformer;
+import org.apache.tika.fuzzing.general.SpanSwapper;
+import org.apache.tika.fuzzing.general.Truncator;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.PipesConfig;
+import org.apache.tika.pipes.PipesParser;
+import org.apache.tika.pipes.PipesResult;
+import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.fetcher.FetchKey;
+import org.apache.tika.pipes.fetcher.FetcherManager;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
 
 public class FuzzingCLI {
-    private static final Logger LOG = 
LoggerFactory.getLogger(FuzzingCLI.class);
-
-    private static final Path POISON = Paths.get("");
 
-    private final int maxFiles = -1;
+    private static final Logger LOG = 
LoggerFactory.getLogger(FuzzingCLI.class);
+    private static final String TEMP_FETCHER_NAME = "temp";
+    private static final String TEMP_EMITTER_NAME = "temp";
 
     public static void main(String[] args) throws Exception {
         FuzzingCLIConfig config = FuzzingCLIConfig.parse(args);
         if (config.getMaxTransformers() == 0) {
             LOG.warn("max transformers == 0!");
         }
-        if (!Files.isDirectory(config.inputDir)) {
-            throw new IllegalArgumentException("input directory doesn't exist: 
" + config.inputDir);
-        }
+
         FuzzingCLI fuzzingCLI = new FuzzingCLI();
-        Files.createDirectories(config.getOutputDirectory());
+        Files.createDirectories(config.getProblemsDirectory());
         fuzzingCLI.execute(config);
     }
 
-    private void execute(FuzzingCLIConfig config) {
-        ArrayBlockingQueue<Path> q = new ArrayBlockingQueue(10000);
-        ExecutorService executorService = 
Executors.newFixedThreadPool(config.getNumThreads() + 1);
+
+    private void execute(FuzzingCLIConfig config) throws Exception {
+        ArrayBlockingQueue<FetchEmitTuple> q = new ArrayBlockingQueue(10000);
+
+        PipesConfig pipesConfig = PipesConfig.load(config.getTikaConfig());
+        FetcherManager fetcherManager = 
FetcherManager.load(config.getTikaConfig());
+
+        int totalThreads = pipesConfig.getNumClients() + 1;
+
+        ExecutorService executorService = 
Executors.newFixedThreadPool(totalThreads);
         ExecutorCompletionService executorCompletionService =
                 new ExecutorCompletionService(executorService);
-        FileAdder fileAdder = new FileAdder(config.getInputDirectory(), 
config.getNumThreads(), q);
+        PipesIterator pipesIterator = 
PipesIterator.build(config.getTikaConfig());
+
+        FileAdder fileAdder = new FileAdder(pipesIterator, q);
         executorCompletionService.submit(fileAdder);
-        for (int i = 0; i < config.numThreads; i++) {
-            executorCompletionService.submit(new Fuzzer(q, config));
-        }
-        int finished = 0;
-        while (finished < config.getNumThreads() + 1) {
-            Future<Integer> future = null;
-            try {
-                future = executorCompletionService.poll(1, TimeUnit.SECONDS);
-                if (future != null) {
-                    future.get();
-                    finished++;
+        try (PipesParser parser = new PipesParser(pipesConfig)) {
+
+            for (int i = 0; i < pipesConfig.getNumClients(); i++) {
+                executorCompletionService.submit(new Fuzzer(q, config, parser, 
fetcherManager));
+            }
+            int finished = 0;
+            while (finished < totalThreads) {
+                Future<Integer> future = null;
+                try {
+                    future = executorCompletionService.poll(1, 
TimeUnit.SECONDS);
+                    if (future != null) {
+                        future.get();
+                        finished++;
+                    }
+                    LOG.info("Finished thread {} threads of {}", finished, 
totalThreads);
+                } catch (InterruptedException | ExecutionException e) {
+                    e.printStackTrace();
+                    break;
                 }
-            } catch (InterruptedException | ExecutionException e) {
-                e.printStackTrace();
-                break;
             }
+            executorService.shutdown();
+            executorService.shutdownNow();
         }
-        executorService.shutdownNow();
+
     }
 
     private static class Fuzzer implements Callable<Integer> {
         static AtomicInteger COUNTER = new AtomicInteger();
+        static AtomicInteger FUZZED = new AtomicInteger();
+        static AtomicInteger SOURCE_FILES = new AtomicInteger();
         private final int threadId = COUNTER.getAndIncrement();
-        private final ArrayBlockingQueue<Path> q;
+        private final ArrayBlockingQueue<FetchEmitTuple> q;
         private final FuzzingCLIConfig config;
 
-        public Fuzzer(ArrayBlockingQueue<Path> q, FuzzingCLIConfig config) {
+        private final PipesParser pipesParser;
+
+        private final Transformer transformer;
+
+        private final FetcherManager fetcherManager;
+
+        public Fuzzer(ArrayBlockingQueue<FetchEmitTuple> q, FuzzingCLIConfig 
config,
+                      PipesParser pipesParser, FetcherManager fetcherManager) {
             this.q = q;
             this.config = config;
+            this.pipesParser = pipesParser;
+            //TODO - parameterize this
+            this.transformer =
+                    new GeneralTransformer(config.getMaxTransformers(), new 
ByteDeleter(),
+                            new ByteFlipper(), new ByteInjector(), new 
Truncator(),
+                            new SpanSwapper());
+            this.fetcherManager = fetcherManager;
         }
 
         @Override
         public Integer call() throws Exception {
             while (true) {
-                Path p = q.take();
-                if (p.equals(POISON)) {
+                FetchEmitTuple fetchEmitTuple = q.take();
+                if (fetchEmitTuple.equals(PipesIterator.COMPLETED_SEMAPHORE)) {
                     LOG.debug("Thread " + threadId + " stopping");
+                    q.put(PipesIterator.COMPLETED_SEMAPHORE);
                     return 1;
                 }
-                boolean success = false;
-                int tries = 0;
-                while (!success && tries < config.getRetries()) {
-                    if (tries > 0) {
-                        LOG.warn("Retrying (" + tries + ") " + p);
+                int inputFiles = SOURCE_FILES.getAndIncrement();
+                if (inputFiles % 100 == 0) {
+                    LOG.info("Processed {} source files", inputFiles);
+                }
+                for (int i = 0; i < config.perFileIterations; i++) {
+                    try {
+                        fuzzIt(fetchEmitTuple);
+                    } catch (InterruptedException e) {
+                        throw e;
+                    } catch (Exception e) {
+                        LOG.warn("serious problem with", e);
                     }
-                    success = fuzzIt(config, p, tries);
-                    tries++;
                 }
             }
         }
 
-        private boolean fuzzIt(FuzzingCLIConfig config, Path p, int retryId) {
-            //the target files should be flattened so that
-            //problematic files are all in one directory...may rethink this 
option later
-            Path target = config.getOutputDirectory().resolve(p.getFileName());
-            String cp = System.getProperty("java.class.path");
-
-            String[] args =
-                    new String[]{"java", "-XX:-OmitStackTraceInFastThrow", 
"-Xmx" + config.xmx,
-                            "-ea", "-cp", ProcessUtils.escapeCommandLine(cp),
-                            "org.apache.tika.fuzzing.cli.FuzzOne", "-i",
-                            
ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()), "-o",
-                            
ProcessUtils.escapeCommandLine(target.toAbsolutePath().toString()),
-                            "-p", 
Integer.toString(config.getPerFileIterations()), "-t",
-                            Integer.toString(config.getMaxTransformers()), 
"-n",
-                            Integer.toString(threadId), "-r", 
Integer.toString(retryId), "-m",
-                            Long.toString(config.getTimeoutMs())};
-            ProcessBuilder pb = new ProcessBuilder(args);
-            pb.inheritIO();
-            Process process = null;
-            boolean success = false;
-            try {
-                process = pb.start();
-            } catch (IOException e) {
-                LOG.warn("problem starting process", e);
-            }
+        private void fuzzIt(FetchEmitTuple fetchEmitTuple)
+                throws IOException, InterruptedException, TikaException {
+            Path cwd = Files.createTempDirectory("tika-fuzz-");
             try {
-                long totalTime = 2 * config.getTimeoutMs() * 
config.getPerFileIterations();
-                success = process.waitFor(totalTime, TimeUnit.MILLISECONDS);
-            } catch (InterruptedException e) {
-                LOG.warn("problem waiting for process to finish", e);
-            } finally {
-                if (process.isAlive()) {
-                    LOG.warn("process still alive for " + 
target.toAbsolutePath());
-                    process.destroyForcibly();
+                Path fuzzedPath = fuzz(fetchEmitTuple, cwd);
+                Path extract = Files.createTempFile(cwd, "tika-extract-", 
".json");
+                FetchEmitTuple fuzzedTuple = new 
FetchEmitTuple(fetchEmitTuple.getId(),
+                        new FetchKey(TEMP_FETCHER_NAME, 
fuzzedPath.toAbsolutePath().toString()),
+                        new EmitKey(TEMP_EMITTER_NAME, 
extract.toAbsolutePath().toString()));
+                int count = FUZZED.getAndIncrement();
+                if (count % 100 == 0) {
+                    LOG.info("processed {} fuzzed files", count);
                 }
-                try {
-                    int exitValue = process.exitValue();
-                    if (exitValue != 0) {
-                        success = false;
-                        LOG.warn("bad exit value for " + 
target.toAbsolutePath());
+                boolean tryAgain = true;
+                int tries = 0;
+                while (tryAgain && tries < config.getRetries()) {
+                    tries++;
+                    try {
+                        PipesResult result = pipesParser.parse(fuzzedTuple);
+                        tryAgain = handleResult(result.getStatus(),
+                                fetchEmitTuple.getFetchKey().getFetchKey(), 
fuzzedPath, tries,
+                                config.getRetries());
+                    } catch (InterruptedException e) {
+                        throw e;
+                    } catch (Exception e) {
+                        tryAgain = 
handleResult(PipesResult.STATUS.UNSPECIFIED_CRASH,
+                                fetchEmitTuple.getFetchKey().getFetchKey(), 
fuzzedPath, tries,
+                                config.getRetries());
                     }
-                } catch (IllegalThreadStateException e) {
-                    success = false;
-                    LOG.warn("not exited");
-                    process.destroyForcibly();
+                }
+            } finally {
+                try {
+                    FileUtils.deleteDirectory(cwd.toFile());
+                } catch (IOException e) {
+                    e.printStackTrace();
+                    LOG.warn("Couldn't delete " + cwd.toAbsolutePath(), e);
+                }
+            }
+        }
+
+        private Path fuzz(FetchEmitTuple fetchEmitTuple, Path cwd)
+                throws IOException, TikaException {
+            Path target = Files.createTempFile(cwd, "tika-fuzz-target-",
+                    "." + 
FilenameUtils.getExtension(fetchEmitTuple.getFetchKey().getFetchKey()));
+            try (InputStream is = fetcherManager.getFetcher(
+                            fetchEmitTuple.getFetchKey().getFetcherName())
+                    .fetch(fetchEmitTuple.getFetchKey().getFetchKey(), new 
Metadata())) {
+                try (OutputStream os = Files.newOutputStream(target)) {
+                    transformer.transform(is, os);
                 }
             }
-            return success;
+            return target;
+        }
+
+        private boolean handleResult(PipesResult.STATUS status, String 
origFetchKey,
+                                     Path fuzzedPath, int tries, int 
maxRetries)
+                throws IOException {
+            switch (status) {
+                case OOM:
+                case TIMEOUT:
+                case UNSPECIFIED_CRASH:
+                    if (tries < maxRetries) {
+                        LOG.info("trying again ({} of {}) {} : {}", tries, 
maxRetries,
+                                status.name());
+                        return true;
+                    }
+                    Path problemFilePath = getProblemFile(status, 
origFetchKey);
+                    LOG.info("found a problem {} -> {} : {}", origFetchKey, 
problemFilePath,
+                            status.name());
+                    Files.copy(fuzzedPath, problemFilePath);
+                    return false;
+                default:
+                    //if there wasn't a problem
+                    return false;
+            }
+        }
+
+        private Path getProblemFile(PipesResult.STATUS status, String 
origFetchKey)
+                throws IOException {
+            String name = FilenameUtils.getName(origFetchKey) + "-" + 
UUID.randomUUID();
+            Path problemFile =
+                    
config.getProblemsDirectory().resolve(status.name().toLowerCase(Locale.US))
+                            .resolve(name);
+            Files.createDirectories(problemFile.getParent());
+            return problemFile;
         }
 
     }
 
     private class FileAdder implements Callable<Integer> {
-        private final Path inputDir;
-        private final int numThreads;
-        private final ArrayBlockingQueue<Path> queue;
+        private final PipesIterator pipesIterator;
+        private final ArrayBlockingQueue<FetchEmitTuple> queue;
         private int added = 0;
 
-        public FileAdder(Path inputDirectory, int numThreads, 
ArrayBlockingQueue<Path> queue) {
-            this.inputDir = inputDirectory;
-            this.numThreads = numThreads;
+        public FileAdder(PipesIterator pipesIterator, 
ArrayBlockingQueue<FetchEmitTuple> queue) {
+            this.pipesIterator = pipesIterator;
             this.queue = queue;
         }
 
         @Override
         public Integer call() throws Exception {
-            Files.walkFileTree(inputDir, new DirWalker());
-            for (int i = 0; i < numThreads; i++) {
-                queue.add(POISON);
+            int added = 0;
+            for (FetchEmitTuple tuple : pipesIterator) {
+                //put() blocks indefinitely while the queue is full -- should use offer() with a timeout
+                queue.put(tuple);
+                added++;
             }
+            queue.put(PipesIterator.COMPLETED_SEMAPHORE);
+            LOG.info("file adder finished " + added);
             return 1;
         }
-
-        private class DirWalker implements FileVisitor<Path> {
-
-            @Override
-            public FileVisitResult preVisitDirectory(Path dir, 
BasicFileAttributes attrs)
-                    throws IOException {
-                return FileVisitResult.CONTINUE;
-            }
-
-            @Override
-            public FileVisitResult visitFile(Path file, BasicFileAttributes 
attrs)
-                    throws IOException {
-                if (maxFiles > -1 && added >= maxFiles) {
-                    LOG.info("hit maxfiles; file crawler is stopping early");
-                    return FileVisitResult.TERMINATE;
-                }
-                if (!file.getFileName().toString().contains("sas7bdat")) {
-                    return FileVisitResult.CONTINUE;
-                }
-                try {
-                    boolean offered = queue.offer(file, 10, TimeUnit.MINUTES);
-                    if (offered) {
-                        added++;
-                        return FileVisitResult.CONTINUE;
-                    } else {
-                        LOG.error("couldn't add a file after 10 minutes!");
-                        return FileVisitResult.TERMINATE;
-                    }
-                } catch (InterruptedException e) {
-                    e.printStackTrace();
-                    return FileVisitResult.TERMINATE;
-                }
-            }
-
-            @Override
-            public FileVisitResult visitFileFailed(Path file, IOException exc) 
throws IOException {
-                return FileVisitResult.CONTINUE;
-            }
-
-            @Override
-            public FileVisitResult postVisitDirectory(Path dir, IOException 
exc)
-                    throws IOException {
-                return FileVisitResult.CONTINUE;
-            }
-        }
     }
 }
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
index c741616d8..1a58b72a5 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
@@ -28,105 +28,76 @@ import org.apache.commons.cli.ParseException;
 
 public class FuzzingCLIConfig {
 
-    private static final int DEFAULT_NUM_THREADS = 4;
-    private static final int DEFAULT_NUM_ITERATIONS = 1000;
-    //allow all transformers to operate
-    private static final int DEFAULT_MAX_TRANSFORMERS = -1;
-
-    private static final long DEFAULT_TIMEOUT_MS = 120000;
+    private static final int DEFAULT_NUM_ITERATIONS = 100;
 
-    private static final int DEFAULT_RETRIES = 2;
+    //by default, run a single transformer per iteration
+    private static final int DEFAULT_MAX_TRANSFORMERS = 1;
 
-    private static final String DEFAULT_XMX = "512m";
+    private static final int DEFAULT_RETRIES = 1;
 
     static Options OPTIONS;
 
     static {
-        //By the time this commandline is parsed, there should be both an 
extracts and an inputDir
-        Option extracts = new Option("extracts", true, "directory for extract 
files");
-        extracts.setRequired(true);
-
-
-        OPTIONS = new Options().addOption(
-                Option.builder("i").longOpt("inputDir").desc("input directory 
for seed files")
-                        .hasArg(true).required(true).build()).addOption(
-                Option.builder("o").longOpt("outputDir")
-                        .desc("output directory for files that triggered 
problems").hasArg(true)
-                        .required(true).build()).addOption(
-                Option.builder("n").longOpt("numThreads").desc("number of 
threads").hasArg(true)
-                        
.required(false).build()).addOption(Option.builder("p").longOpt("perFile")
+        Option problems = new Option("o", "output", true, "directory for 
problems files");
+        problems.setRequired(true);
+
+
+        OPTIONS = new Options().addOption(problems)
+                .addOption(Option.builder("c").longOpt("config").hasArg(true)
+                        .desc("tika config " +
+                                "file with " +
+                                "specs for pipes parser, pipes iterator, 
fetchers and emitters")
+                        .required(true).build())
+                .addOption(Option.builder("p").longOpt("perFile")
                 .desc("number of iterations to run per seed 
file").hasArg(true).required(false)
-                
.build()).addOption(Option.builder("t").longOpt("maxTransformers")
+                .build())
+                .addOption(Option.builder("t").longOpt("maxTransformers")
                 .desc("maximum number of transformers to run per 
iteration").hasArg(true)
-                
.required(false).build()).addOption(Option.builder("m").longOpt("timeoutMs")
-                .desc("timeout in ms -- max time allowed to parse a 
file").hasArg(true)
-                
.required(false).build()).addOption(Option.builder("x").longOpt("xmx")
-                .desc("e.g. 1G, max heap appended to -Xmx in the forked 
process").hasArg(true)
-                
.required(false).build()).addOption(Option.builder("r").longOpt("retries")
+                .required(false).build())
+                .addOption(Option.builder("r").longOpt("retries")
                 .desc("number of times to retry a seed file if there's a 
catastrophic failure")
                 .hasArg(true).required(false).build());
 
     }
-
-    int numThreads;
     //number of variants tried per file
-    int perFileIterations;
+    int perFileIterations = DEFAULT_NUM_ITERATIONS;
     //maxTransformers per file
-    int maxTransformers;
+    int maxTransformers = DEFAULT_MAX_TRANSFORMERS;
     //max time allowed to process each file in milliseconds
     long timeoutMS;
     //times to retry a seed file after a catastrophic failure
-    int retries;
-    //xmx for forked process, e.g. 512m or 1G
-    String xmx;
-    Path inputDir;
-    Path outputDir;
+    int retries = DEFAULT_RETRIES;
+
+    Path tikaConfig;
+
+    Path problemsDir;
 
     public static FuzzingCLIConfig parse(String[] args) throws ParseException {
         CommandLineParser parser = new DefaultParser();
         CommandLine commandLine = parser.parse(OPTIONS, args);
         FuzzingCLIConfig config = new FuzzingCLIConfig();
-        config.inputDir = Paths.get(commandLine.getOptionValue("i"));
-        config.outputDir = Paths.get(commandLine.getOptionValue("o"));
-        config.numThreads =
-                (commandLine.hasOption("n")) ? 
Integer.parseInt(commandLine.getOptionValue("n")) :
-                        DEFAULT_NUM_THREADS;
-        config.perFileIterations =
-                (commandLine.hasOption("p")) ? 
Integer.parseInt(commandLine.getOptionValue("p")) :
-                        DEFAULT_NUM_ITERATIONS;
-        config.maxTransformers =
-                (commandLine.hasOption("t")) ? 
Integer.parseInt(commandLine.getOptionValue("t")) :
-                        DEFAULT_MAX_TRANSFORMERS;
-        config.timeoutMS =
-                (commandLine.hasOption("m")) ? 
Integer.parseInt(commandLine.getOptionValue("m")) :
-                        DEFAULT_TIMEOUT_MS;
+        config.tikaConfig = Paths.get(commandLine.getOptionValue("c"));
+        config.problemsDir = Paths.get(commandLine.getOptionValue("o"));
         config.retries =
                 (commandLine.hasOption("r")) ? 
Integer.parseInt(commandLine.getOptionValue("r")) :
                         DEFAULT_RETRIES;
-        config.xmx = (commandLine.hasOption("x")) ? 
commandLine.getOptionValue("x") : DEFAULT_XMX;
+        config.maxTransformers = (commandLine.hasOption("t")) ?
+                Integer.parseInt(commandLine.getOptionValue("t")) : 
DEFAULT_MAX_TRANSFORMERS;
         return config;
     }
 
-    public int getNumThreads() {
-        return numThreads;
-    }
-
-    public Path getInputDirectory() {
-        return inputDir;
+    public Path getProblemsDirectory() {
+        return problemsDir;
     }
 
-    public Path getOutputDirectory() {
-        return outputDir;
+    public Path getTikaConfig() {
+        return tikaConfig;
     }
 
     public int getMaxTransformers() {
         return maxTransformers;
     }
 
-    public long getTimeoutMs() {
-        return timeoutMS;
-    }
-
     public int getPerFileIterations() {
         return perFileIterations;
     }
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java
index 62f3fc940..20ca55ff3 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java
@@ -70,6 +70,7 @@ public class GeneralTransformer implements Transformer {
     public void transform(InputStream is, OutputStream os) throws IOException, 
TikaException {
         //used for debugging
         if (maxTransforms == 0) {
+            IOUtils.copy(is, os);
             return;
         }
         int transformerCount = (maxTransforms == 1) ? 1 : 1 + 
random.nextInt(maxTransforms);
diff --git a/tika-fuzzing/src/main/resources/log4j2.xml b/tika-fuzzing/src/main/resources/log4j2.xml
index 513b667a8..94ac22b3e 100644
--- a/tika-fuzzing/src/main/resources/log4j2.xml
+++ b/tika-fuzzing/src/main/resources/log4j2.xml
@@ -28,5 +28,11 @@
     <Root level="info">
       <AppenderRef ref="Console"/>
     </Root>
+    <Logger name="org.apache.tika.pipes" level="error" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
+    <Logger name="com.github.junrar" level="error" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
   </Loggers>
 </Configuration>
diff --git a/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml b/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml
new file mode 100644
index 000000000..4b255b06b
--- /dev/null
+++ b/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<!-- This is an example configuration file to run the fuzzer against
+  an input directory.  Make sure to specify the input file directory
+    in the basePath elements.  We need the "empty" (no basePath) fetcher and emitter
+    named "temp" to handle the temp files that are created via fuzzing. -->
+<properties>
+       <fetchers>
+               <fetcher 
class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
+                       <params>
+                               <name>fsf</name>
+                               <basePath>{FILL_IN_HERE}</basePath>
+                       </params>
+               </fetcher>
+               <fetcher 
class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
+                       <params>
+                               <name>temp</name>
+                       </params>
+               </fetcher>
+       </fetchers>
+       <emitters>
+               <emitter 
class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
+                       <params>
+                               <name>fse</name>
+                               <basePath>{FILL_IN_HERE}</basePath>
+                       </params>
+               </emitter>
+               <emitter 
class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
+                       <params>
+                               <name>temp</name>
+                       </params>
+               </emitter>
+       </emitters>
+       <pipesIterator 
class="org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator">
+               <params>
+                       <basePath>{FILL_IN_HERE}</basePath>
+                       <fetcherName>fsf</fetcherName>
+                       <emitterName>fse</emitterName>
+               </params>
+       </pipesIterator>
+       <pipes>
+               <params>
+                       <numClients>5</numClients>
+                       <forkedJvmArgs>
+                               <arg>-Xmx1g</arg>
+                               <arg>-XX:ParallelGCThreads=2</arg>
+                               
<arg>-Dlog4j.configurationFile={FILL_IN_HERE}</arg>
+                       </forkedJvmArgs>
+                       <timeoutMillis>10000</timeoutMillis>
+               </params>
+       </pipes>
+</properties>
\ No newline at end of file
diff --git a/tika-fuzzing/src/test/resources/log4j2.xml b/tika-fuzzing/src/test/resources/log4j2.xml
index 611f36d37..eaeca677e 100644
--- a/tika-fuzzing/src/test/resources/log4j2.xml
+++ b/tika-fuzzing/src/test/resources/log4j2.xml
@@ -25,8 +25,18 @@
     </Console>
   </Appenders>
   <Loggers>
-    <Root level="debug">
+    <Root level="info">
       <AppenderRef ref="Console"/>
     </Root>
+    <Logger name="org.apache.tika.pipes" level="error" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
+    <Logger name="com.github.junrar" level="error" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
+    <Logger name="org.apache.pdfbox" level="fatal" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
+
   </Loggers>
-</Configuration>
+</Configuration>
\ No newline at end of file

Reply via email to