This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 06769d3 Added case-insensitivity to tika server ocr header names (#414)
06769d3 is described below
commit 06769d336ff314d8243decd697a8e520c954afc6
Author: Subhajit Das <[email protected]>
AuthorDate: Tue Mar 16 02:18:52 2021 +0530
Added case-insensitivity to tika server ocr header names (#414)
---
.../server/classic/config/PDFServerConfig.java | 2 +-
.../classic/config/TesseractServerConfig.java | 2 +-
.../tika/server/classic/TikaResourceTest.java | 47 ++++++++++++++++++++++
.../tika/server/core/resource/TikaResource.java | 14 +++++--
4 files changed, 59 insertions(+), 6 deletions(-)
diff --git a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
index 9dcf61d..9058272 100644
--- a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
+++ b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
@@ -40,7 +40,7 @@ public class PDFServerConfig implements ParseContextConfig {
//upon server startup will be ignored.
PDFParserConfig pdfParserConfig = null;
for (String key : httpHeaders.keySet()) {
- if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+ if (StringUtils.startsWithIgnoreCase(key, X_TIKA_PDF_HEADER_PREFIX)) {
pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
}
diff --git a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
index 3db0859..3041400 100644
--- a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
+++ b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
@@ -40,7 +40,7 @@ public class TesseractServerConfig implements ParseContextConfig {
TesseractOCRConfig ocrConfig = null;
DocumentSelector documentSelector = null;
for (String key : httpHeaders.keySet()) {
- if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+ if (StringUtils.startsWithIgnoreCase(key, X_TIKA_OCR_HEADER_PREFIX)) {
ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
}
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index d80798a..7104484 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -42,6 +42,7 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
import static org.junit.Assert.assertEquals;
@@ -351,6 +352,52 @@ public class TikaResourceTest extends CXFTestBase {
assertEquals(400, response.getStatus());
}
+ // TIKA-3320
+ @Test
+ public void testPDFLowerCaseOCRConfig() throws Exception {
+ if (! new TesseractOCRParser().hasTesseract()) {
+ return;
+ }
+
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "no_ocr")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+
+ assertTrue(responseMsg.trim().equals(""));
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TesseractServerConfig.X_TIKA_OCR_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"skipocr", "true")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+
+ assertTrue(responseMsg.trim().equals(""));
+
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "ocr_only")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertContains("Happy New Year 2003!", responseMsg);
+
+ //now try a bad value
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT) + "ocrstrategy", "non-sense-value")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ assertEquals(400, response.getStatus());
+ }
+
//TIKA-2669
@Test
public void testPDFConfig() throws Exception {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 26c6827..d1b25a6 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -165,15 +165,21 @@ public class TikaResource {
val = val.trim();
try {
- String property = StringUtils.removeStart(key, prefix);
+ String property = StringUtils.removeStartIgnoreCase(key, prefix);
Field field = null;
try {
field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
} catch (NoSuchFieldException e) {
- //swallow
+ // try to match field case-insensitive way
+ for(Field aField : object.getClass().getDeclaredFields()) {
+ if (aField.getName().equalsIgnoreCase(property)) {
+ field = aField;
+ break;
+ }
+ }
}
- String setter = property;
- setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
+ String setter = field != null ? field.getName() : property;
+ setter = "set" + setter.substring(0, 1).toUpperCase(Locale.US) + setter.substring(1);
//default assume string class
//if there's a more specific type, e.g. double, int, boolean
//try that.