This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9dfccdebdc02fc18bb94badaa8a71f93c3b26690
Author: Alexander Klimetschek <[email protected]>
AuthorDate: Wed Oct 7 10:47:33 2020 -0700

    add -C/--content cli option using WriteOutContentHandler (#312)
---
 tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java | 14 +++++++++++++-
 .../src/test/java/org/apache/tika/cli/TikaCLITest.java | 14 ++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 7a3a30c..216f9c6 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -105,6 +105,7 @@ import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.xmp.XMPMetadata;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -264,7 +265,15 @@ public class TikaCLI {
             return new BoilerpipeContentHandler(getOutputWriter(output, encoding));
         }
     };
-
+
+    private final OutputType CONTENT = new OutputType() {
+        @Override
+        protected ContentHandler getContentHandler(
+                OutputStream output, Metadata metadata) throws Exception {
+            return new WriteOutContentHandler(getOutputWriter(output, encoding));
+        }
+    };
+
     private final OutputType METADATA = new OutputType() {
         @Override
         protected ContentHandler getContentHandler(
@@ -438,6 +447,8 @@ public class TikaCLI {
             type = TEXT;
         } else if (arg.equals("-T") || arg.equals("--text-main")) {
             type = TEXT_MAIN;
+        } else if (arg.equals("-C") || arg.equals("--content")) {
+            type = CONTENT;
         } else if (arg.equals("-m") || arg.equals("--metadata")) {
             type = METADATA;
         } else if (arg.equals("-l") || arg.equals("--language")) {
@@ -563,6 +574,7 @@ public class TikaCLI {
         out.println(" -h or --html Output HTML content");
         out.println(" -t or --text Output plain text content");
         out.println(" -T or --text-main Output plain text content (main content only)");
+        out.println(" -C or --content Output all text content");
         out.println(" -m or --metadata Output only metadata");
         out.println(" -j or --json Output metadata in JSON");
         out.println(" -y or --xmp Output metadata in XMP");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 0a0ae17..955433e 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -154,6 +154,20 @@ public class TikaCLITest {
     }
 
     /**
+     * Tests -C option of the cli
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testContentOutput() throws Exception{
+        String[] params = {"-C", resourcePrefix + "testJsonMultipleInts.html"};
+        TikaCLI.main(params);
+        String out = outContent.toString(UTF_8.name());
+        assertTrue(out.contains("this is a title"));
+        assertTrue(out.contains("body"));
+    }
+
+    /**
      * Tests -f option of the cli
      *
      * @throws Exception
