This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4243-updates
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4243-updates by this push:
new 5829ec1e9 TIKA-4243 -- remove superclass from serialization and
generally clean up, update PDFParserConfig to work with new framework
5829ec1e9 is described below
commit 5829ec1e9de450157c4efdf1f12bed22c53f626d
Author: tallison <[email protected]>
AuthorDate: Fri Jun 7 14:02:30 2024 -0400
TIKA-4243 -- remove superclass from serialization and generally clean up,
update PDFParserConfig to work with new framework
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 2 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 45 ++++++++---------
.../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 2 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 3 +-
.../tika/parser/pdf/tika-config-non-primitives.xml | 2 +-
.../tika/serialization/TikaJsonSerializer.java | 3 +-
.../apache/tika/server/standard/TikaPipesTest.java | 57 ++++++++++++++++++----
.../tika/server/standard/UnpackerResourceTest.java | 2 +-
.../standard/UnpackerResourceWithConfigTest.java | 4 +-
10 files changed, 76 insertions(+), 46 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 4d0a08226..4bc31c3d9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -673,7 +673,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try {
BufferedImage image =
- renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+ renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType().getImageType());
//TODO -- get suffix based on OcrImageType
tmpFile = tmpResources.createTempFile();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index f21b65d4e..bfd54d4ef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -1099,7 +1099,7 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
//set a default renderer if nothing was defined
PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer();
pdfBoxRenderer.setDPI(config.getOcrDPI());
- pdfBoxRenderer.setImageType(config.getOcrImageType());
+ pdfBoxRenderer.setImageType(config.getOcrImageType().getImageType());
pdfBoxRenderer.setImageFormatName(config.getOcrImageFormatName());
config.setRenderer(pdfBoxRenderer);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 0ee4b274b..be4234b87 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -114,7 +114,7 @@ public class PDFParserConfig implements Serializable {
private OCR_RENDERING_STRATEGY ocrRenderingStrategy =
OCR_RENDERING_STRATEGY.ALL;
private int ocrDPI = 300;
- private ImageType ocrImageType = ImageType.GRAY;
+ private OCR_IMAGE_TYPE ocrImageType = OCR_IMAGE_TYPE.GRAY;
private String ocrImageFormatName = "png";
private float ocrImageQuality = 1.0f;
@@ -623,9 +623,9 @@ public class PDFParserConfig implements Serializable {
* Image type used to render the page image for OCR.
*
* @return image type
- * @see #setOcrImageType(ImageType)
+ * @see #setOcrImageType(OCR_IMAGE_TYPE)
*/
- public ImageType getOcrImageType() {
+ public OCR_IMAGE_TYPE getOcrImageType() {
return ocrImageType;
}
@@ -634,7 +634,7 @@ public class PDFParserConfig implements Serializable {
*
* @param ocrImageType
*/
- public void setOcrImageType(ImageType ocrImageType) {
+ public void setOcrImageType(OCR_IMAGE_TYPE ocrImageType) {
this.ocrImageType = ocrImageType;
userConfigured.add("ocrImageType");
}
@@ -642,10 +642,10 @@ public class PDFParserConfig implements Serializable {
/**
* Image type used to render the page image for OCR.
*
- * @see #setOcrImageType(ImageType)
+ * @see #setOcrImageType(OCR_IMAGE_TYPE)
*/
public void setOcrImageType(String ocrImageTypeString) {
- setOcrImageType(parseImageType(ocrImageTypeString));
+ setOcrImageType(OCR_IMAGE_TYPE.valueOf(ocrImageTypeString));
}
/**
@@ -749,26 +749,6 @@ public class PDFParserConfig implements Serializable {
userConfigured.add("setKCMS");
}
- private ImageType parseImageType(String ocrImageType) {
- for (ImageType t : ImageType.values()) {
- if (ocrImageType.equalsIgnoreCase(t.toString())) {
- return t;
- }
- }
- StringBuilder sb = new StringBuilder();
- sb.append("I regret that I could not parse '");
- sb.append(ocrImageType);
- sb.append("'. I'm only familiar with: ");
- int i = 0;
- for (ImageType t : ImageType.values()) {
- if (i++ == 0) {
- sb.append(", ");
- }
- sb.append(t.toString());
- }
- throw new IllegalArgumentException(sb.toString());
- }
-
public boolean isDetectAngles() {
return detectAngles;
}
@@ -1097,4 +1077,17 @@ public class PDFParserConfig implements Serializable {
throw new IllegalArgumentException(sb.toString());
}
}
+
+ public enum OCR_IMAGE_TYPE {
+ GRAY(ImageType.GRAY),
+ RGB(ImageType.RGB);
+ //add more as needed
+ ImageType imageType;
+ OCR_IMAGE_TYPE(ImageType imageType) {
+ this.imageType = imageType;
+ }
+ public ImageType getImageType() {
+ return imageType;
+ }
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index b3ce7d9d7..72392bc2d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -226,7 +226,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
if (pdfParserConfig == null) {
return defaultImageType;
}
- return pdfParserConfig.getOcrImageType();
+ return pdfParserConfig.getOcrImageType().getImageType();
}
protected String getImageFormatName(ParseContext parseContext) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index a3ed89497..115a9a978 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -38,7 +38,6 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
-import org.apache.pdfbox.rendering.ImageType;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
@@ -1129,7 +1128,7 @@ public class PDFParserTest extends TikaTest {
pdfParser.getClass().getName());
assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY,
((PDFParser) pdfParser).getPDFParserConfig().getOcrStrategy());
- assertEquals(ImageType.RGB,
+ assertEquals(PDFParserConfig.OCR_IMAGE_TYPE.RGB,
((PDFParser) pdfParser).getPDFParserConfig().getOcrImageType());
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
index 3cc9d8b23..036c8b595 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
@@ -20,7 +20,7 @@
<parser class="org.apache.tika.parser.pdf.PDFParser">
<params>
<param name="sortByPosition" type="bool">true</param>
- <param name="ocrImageType" type="string">rgb</param>
+ <param name="ocrImageType" type="string">RGB</param>
<param name="ocrStrategy" type="string">ocr_only</param>
</params>
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java
index e3615d2d1..01211ff98 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java
@@ -104,8 +104,9 @@ public class TikaJsonSerializer {
.getClass()
.getConstructor();
} catch (NoSuchMethodException e) {
- throw new IllegalArgumentException("class (" + obj.getClass() + ") doesn't have a no-arg constructor. Respectfully not seralizing.");
+ throw new IllegalArgumentException("class (" + obj.getClass() + ") doesn't have a no-arg constructor. Respectfully not serializing.");
}
+
try {
if (fieldName != null) {
jsonGenerator.writeFieldName(fieldName);
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
index d84c18d41..e192d485f 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
@@ -38,6 +38,7 @@ import java.util.List;
import java.util.Map;
import jakarta.ws.rs.core.Response;
+import org.apache.commons.io.FileUtils;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
@@ -51,6 +52,7 @@ import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.pipes.emitter.EmitKey;
@@ -76,10 +78,11 @@ public class TikaPipesTest extends CXFTestBase {
private static final String PIPES_PATH = "/pipes";
private static final String TEST_RECURSIVE_DOC =
"test_recursive_embedded.docx";
+ private static final String TEST_TWO_BOXES_PDF = "testPDFTwoTextBoxes.pdf";
+
@TempDir
private static Path TMP_WORKING_DIR;
private static Path TMP_OUTPUT_DIR;
- private static Path TMP_OUTPUT_FILE;
private static Path TMP_BYTES_DIR;
private static Path TIKA_PIPES_LOG4j2_PATH;
private static Path TIKA_CONFIG_PATH;
@@ -91,13 +94,15 @@ public class TikaPipesTest extends CXFTestBase {
Path inputDir = TMP_WORKING_DIR.resolve("input");
TMP_OUTPUT_DIR = TMP_WORKING_DIR.resolve("output");
TMP_BYTES_DIR = TMP_WORKING_DIR.resolve("bytes");
- TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json");
Files.createDirectories(inputDir);
Files.createDirectories(TMP_OUTPUT_DIR);
Files.copy(TikaPipesTest.class.getResourceAsStream("/test-documents/" + TEST_RECURSIVE_DOC), inputDir.resolve("test_recursive_embedded.docx"),
StandardCopyOption.REPLACE_EXISTING);
+ Files.copy(TikaPipesTest.class.getResourceAsStream("/test-documents/" + TEST_TWO_BOXES_PDF), inputDir.resolve(TEST_TWO_BOXES_PDF),
+ StandardCopyOption.REPLACE_EXISTING);
+
TIKA_CONFIG_PATH = Files.createTempFile(TMP_WORKING_DIR,
"tika-pipes-", ".xml");
TIKA_PIPES_LOG4j2_PATH = Files.createTempFile(TMP_WORKING_DIR,
"log4j2-", ".xml");
Files.copy(TikaPipesTest.class.getResourceAsStream("/log4j2.xml"),
TIKA_PIPES_LOG4j2_PATH, StandardCopyOption.REPLACE_EXISTING);
@@ -120,11 +125,8 @@ public class TikaPipesTest extends CXFTestBase {
@BeforeEach
public void setUpEachTest() throws Exception {
- if (Files.exists(TMP_OUTPUT_FILE)) {
- Files.delete(TMP_OUTPUT_FILE);
- }
-
- assertFalse(Files.isRegularFile(TMP_OUTPUT_FILE));
+ FileUtils.deleteDirectory(TMP_OUTPUT_DIR.toFile());
+ assertFalse(Files.isDirectory(TMP_OUTPUT_DIR));
}
@Override
@@ -173,7 +175,8 @@ public class TikaPipesTest extends CXFTestBase {
assertEquals(200, response.getStatus());
List<Metadata> metadataList = null;
- try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) {
+ Path outputFile = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json");
+ try (Reader reader = Files.newBufferedReader(outputFile)) {
metadataList = JsonMetadataList.fromJson(reader);
}
assertEquals(12, metadataList.size());
@@ -182,6 +185,38 @@ public class TikaPipesTest extends CXFTestBase {
.get(TikaCoreProperties.TIKA_CONTENT));
}
+ @Test
+ public void testPDFConfig() throws Exception {
+ Metadata metadata = new Metadata();
+ ParseContext parseContext = new ParseContext();
+ PDFParserConfig pdfParserConfig = new PDFParserConfig();
+ pdfParserConfig.setSortByPosition(true);
+ parseContext.set(PDFParserConfig.class, pdfParserConfig);
+
+ FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey("fsf", TEST_TWO_BOXES_PDF),
+ new EmitKey("fse", ""), metadata, parseContext);
+ StringWriter writer = new StringWriter();
+ JsonFetchEmitTuple.toJson(t, writer);
+ String getUrl = endPoint + PIPES_PATH;
+ Response response = WebClient
+ .create(getUrl)
+ .accept("application/json")
+ .post(writer.toString());
+ assertEquals(200, response.getStatus());
+
+ List<Metadata> metadataList = null;
+ Path outputFile = TMP_OUTPUT_DIR.resolve(TEST_TWO_BOXES_PDF + ".json");
+ try (Reader reader = Files.newBufferedReader(outputFile)) {
+ metadataList = JsonMetadataList.fromJson(reader);
+ }
+ String content = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
+ content = content.replaceAll("\\s+", " ");
+ // Column text is now interleaved:
+ assertContains(
+ "Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+ content);
+ }
+
@Test
public void testConcatenated() throws Exception {
ParseContext parseContext = new ParseContext();
@@ -205,7 +240,8 @@ public class TikaPipesTest extends CXFTestBase {
assertEquals(200, response.getStatus());
List<Metadata> metadataList = null;
- try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) {
+ Path outputFile = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json");
+ try (Reader reader = Files.newBufferedReader(outputFile)) {
metadataList = JsonMetadataList.fromJson(reader);
}
assertEquals(1, metadataList.size());
@@ -241,7 +277,8 @@ public class TikaPipesTest extends CXFTestBase {
assertEquals(200, response.getStatus());
List<Metadata> metadataList = null;
- try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) {
+ Path outputFile = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json");
+ try (Reader reader = Files.newBufferedReader(outputFile)) {
metadataList = JsonMetadataList.fromJson(reader);
}
assertEquals(12, metadataList.size());
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index bd9934dc6..e8ceec7b6 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -268,7 +268,7 @@ public class UnpackerResourceTest extends CXFTestBase {
Response response = WebClient
.create(CXFTestBase.endPoint + ALL_PATH)
.header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy", "RenderPagesAtPageEnd")
- .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "rgb")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "RGB")
.accept("application/zip")
.put(ClassLoader.getSystemResourceAsStream("test-documents/testColorRendering.pdf"));
Map<String, byte[]> results = readZipArchiveBytes((InputStream) response.getEntity());
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
index a8172d5fa..4f8bcdd75 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
@@ -79,7 +79,7 @@ public class UnpackerResourceWithConfigTest extends CXFTestBase {
Response response = WebClient
.create(CXFTestBase.endPoint + ALL_PATH)
.header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy", "RenderPagesAtPageEnd")
- .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "rgb")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "RGB")
.header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageFormatName", "tiff")
.accept("application/zip")
.put(ClassLoader.getSystemResourceAsStream("test-documents/testColorRendering.pdf"));
@@ -131,7 +131,7 @@ public class UnpackerResourceWithConfigTest extends CXFTestBase {
Response response = WebClient
.create(CXFTestBase.endPoint + ALL_PATH)
.header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy", "RenderPagesAtPageEnd")
- .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "gray")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "GRAY")
.header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageFormatName", "jpeg")
.accept("application/zip")
.put(ClassLoader.getSystemResourceAsStream("test-documents/testColorRendering.pdf"));