This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 5a6089f add -C/--content cli option using WriteOutContentHandler (#312)
5a6089f is described below
commit 5a6089f84b47940fe00919a704678aeb541c2669
Author: Alexander Klimetschek <[email protected]>
AuthorDate: Wed Oct 7 10:47:33 2020 -0700
add -C/--content cli option using WriteOutContentHandler (#312)
---
tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java | 14 +++++++++++++-
.../src/test/java/org/apache/tika/cli/TikaCLITest.java | 14 ++++++++++++++
2 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index b57822c..4357add 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -107,6 +107,7 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.xmp.XMPMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -264,7 +265,15 @@ public class TikaCLI {
return new BoilerpipeContentHandler(getOutputWriter(output, encoding));
}
};
-
+
+ private final OutputType CONTENT = new OutputType() {
+ @Override
+ protected ContentHandler getContentHandler(
+ OutputStream output, Metadata metadata) throws Exception {
+ return new WriteOutContentHandler(getOutputWriter(output, encoding));
+ }
+ };
+
private final OutputType METADATA = new OutputType() {
@Override
protected ContentHandler getContentHandler(
@@ -440,6 +449,8 @@ public class TikaCLI {
type = TEXT;
} else if (arg.equals("-T") || arg.equals("--text-main")) {
type = TEXT_MAIN;
+ } else if (arg.equals("-C") || arg.equals("--content")) {
+ type = CONTENT;
} else if (arg.equals("-m") || arg.equals("--metadata")) {
type = METADATA;
} else if (arg.equals("-l") || arg.equals("--language")) {
@@ -567,6 +578,7 @@ public class TikaCLI {
out.println(" -h or --html Output HTML content");
out.println(" -t or --text Output plain text content");
out.println(" -T or --text-main Output plain text content (main content only)");
+ out.println(" -C or --content Output all text content");
out.println(" -m or --metadata Output only metadata");
out.println(" -j or --json Output metadata in JSON");
out.println(" -y or --xmp Output metadata in XMP");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 2790690..1878ed6 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -142,6 +142,20 @@ public class TikaCLITest {
}
/**
+ * Tests -C option of the cli
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testContentOutput() throws Exception{
+ String[] params = {"-C", resourcePrefix + "testJsonMultipleInts.html"};
+ TikaCLI.main(params);
+ String out = outContent.toString(UTF_8.name());
+ assertTrue(out.contains("this is a title"));
+ assertTrue(out.contains("body"));
+ }
+
+ /**
* Tests -f option of the cli
*
* @throws Exception