This is an automated email from the ASF dual-hosted git repository.
janhoy pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 22766e82ec5 SOLR-17888 Mitigate CVE-2025-54988 in tika pdf parser
(#3932)
22766e82ec5 is described below
commit 22766e82ec5950dabe9a30c53e39113e0dfc1a55
Author: Jan Høydahl <[email protected]>
AuthorDate: Sun Dec 14 18:35:27 2025 +0100
SOLR-17888 Mitigate CVE-2025-54988 in tika pdf parser (#3932)
---
...17888-mitigate-tika-cve-disable-xfa-parsing.yml | 9 ++
.../handler/extraction/ParseContextConfig.java | 40 +++++
.../src/test-files/extraction/pdf-with-xfa-xxe.pdf | 100 +++++++++++++
.../collection1/conf/parseContext-vulnerable.xml | 29 ++++
.../solr/collection1/conf/solrconfig.xml | 6 +
.../solr/handler/extraction/CVE202554988Test.java | 164 +++++++++++++++++++++
6 files changed, 348 insertions(+)
diff --git a/changelog/unreleased/SOLR-17888-mitigate-tika-cve-disable-xfa-parsing.yml b/changelog/unreleased/SOLR-17888-mitigate-tika-cve-disable-xfa-parsing.yml
new file mode 100644
index 00000000000..656a41512f9
--- /dev/null
+++ b/changelog/unreleased/SOLR-17888-mitigate-tika-cve-disable-xfa-parsing.yml
@@ -0,0 +1,9 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: Mitigate CVE-2025-54988 by disabling XFA parsing in PDF documents when using SolrCell extraction
+type: security # added, changed, fixed, deprecated, removed, dependency_update, security, other
+authors:
+ - name: Jan Høydahl
+ url: https://home.apache.org/phonebook.html?uid=janhoy
+links:
+ - name: SOLR-17888
+ url: https://issues.apache.org/jira/browse/SOLR-17888
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
index 327fe3e606a..82f8ffbd3f2 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
@@ -29,6 +29,7 @@ import java.util.Map;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.util.SafeXMLParsing;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
@@ -41,6 +42,7 @@ public class ParseContextConfig {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final Map<Class<?>, Object> entries = new HashMap<>();
+ private boolean extractAcroFormContentExplicitlySet = false;
/** Creates an empty Config without any settings (used as placeholder). */
public ParseContextConfig() {}
@@ -117,6 +119,12 @@ public class ParseContextConfig {
}
method.invoke(
instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
+
+ // Track if extractAcroFormContent was explicitly set
+ if ("org.apache.tika.parser.pdf.PDFParserConfig".equals(className)
+ && "extractAcroFormContent".equals(propertyName)) {
+ extractAcroFormContentExplicitlySet = true;
+ }
}
entries.put(interfaceClass, instance);
@@ -136,10 +144,42 @@ public class ParseContextConfig {
public ParseContext create() {
final ParseContext result = new ParseContext();
+ // Apply user-configured entries first
for (Map.Entry<Class<?>, Object> entry : entries.entrySet()) {
result.set((Class) entry.getKey(), entry.getValue());
}
+ // Apply secure defaults for PDF parsing to mitigate CVE-2025-54988
+ PDFParserConfig pdfConfig = result.get(PDFParserConfig.class);
+
+ if (pdfConfig == null) {
+ // No user config - create secure defaults
+ pdfConfig = new PDFParserConfig();
+ pdfConfig.setExtractAcroFormContent(false);
+ pdfConfig.setIfXFAExtractOnlyXFA(false);
+ result.set(PDFParserConfig.class, pdfConfig);
+ log.debug(
+ "Applied secure PDF parsing defaults: extractAcroFormContent=false (CVE-2025-54988 mitigation)");
+ } else if (!extractAcroFormContentExplicitlySet) {
+ // User provided PDFParserConfig but did NOT explicitly set extractAcroFormContent
+ // Apply secure default to protect against CVE-2025-54988
+ pdfConfig.setExtractAcroFormContent(false);
+ pdfConfig.setIfXFAExtractOnlyXFA(false);
+ log.debug(
+ "Applied secure default extractAcroFormContent=false for CVE-2025-54988 mitigation");
+ } else {
+ // User explicitly set extractAcroFormContent - respect their choice but warn if vulnerable
+ if (pdfConfig.getExtractAcroFormContent()) {
+ log.warn(
+ "extractAcroFormContent=true is explicitly set, which may be vulnerable to CVE-2025-54988 XXE attacks. "
+ + "Ensure you trust all PDF sources or disable XFA parsing.");
+ } else {
+ // User explicitly disabled extractAcroFormContent - also disable ifXFAExtractOnlyXFA for
+ // complete mitigation
+ pdfConfig.setIfXFAExtractOnlyXFA(false);
+ }
+ }
+
return result;
}
}
diff --git a/solr/modules/extraction/src/test-files/extraction/pdf-with-xfa-xxe.pdf b/solr/modules/extraction/src/test-files/extraction/pdf-with-xfa-xxe.pdf
new file mode 100644
index 00000000000..66e4c03ea30
--- /dev/null
+++ b/solr/modules/extraction/src/test-files/extraction/pdf-with-xfa-xxe.pdf
@@ -0,0 +1,100 @@
+%PDF-1.4
+1 0 obj
+<<
+/Type /Catalog
+/AcroForm 2 0 R
+/Pages 3 0 R
+>>
+endobj
+
+2 0 obj
+<<
+/XFA [
+ (template) 4 0 R
+]
+>>
+endobj
+
+3 0 obj
+<<
+/Type /Pages
+/Kids [5 0 R]
+/Count 1
+>>
+endobj
+
+4 0 obj
+<<
+/Length 520
+>>
+stream
+<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
+ <subform name="form1">
+ <field name="companyName">
+ <value>
+ <text>ACME Corporation XFA Form Data</text>
+ </value>
+ </field>
+ <field name="secretData">
+ <value>
+ <text>XFA_SENSITIVE_CONTENT_12345</text>
+ </value>
+ </field>
+ <field name="xxeField">
+ <value>
+ <text>POTENTIAL_XXE_ATTACK_VECTOR</text>
+ </value>
+ </field>
+ </subform>
+</template>
+endstream
+endobj
+
+5 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/MediaBox [0 0 612 792]
+/Contents 6 0 R
+/Resources <<
+ /Font <<
+ /F1 <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+ >>
+ >>
+>>
+>>
+endobj
+
+6 0 obj
+<<
+/Length 44
+>>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(Test PDF) Tj
+ET
+endstream
+endobj
+
+xref
+0 7
+0000000000 65535 f
+0000000009 00000 n
+0000000087 00000 n
+0000000140 00000 n
+0000000197 00000 n
+0000000772 00000 n
+0000000961 00000 n
+trailer
+<<
+/Size 7
+/Root 1 0 R
+>>
+startxref
+1054
+%%EOF
diff --git a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/parseContext-vulnerable.xml b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/parseContext-vulnerable.xml
new file mode 100644
index 00000000000..d2677cb1004
--- /dev/null
+++ b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/parseContext-vulnerable.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Vulnerable parseContext configuration that explicitly ENABLES XFA parsing.
+ This overrides the secure defaults to demonstrate CVE-2025-54988 vulnerability.
+ DO NOT USE IN PRODUCTION.
+-->
+<entries>
+ <entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
+ <property name="extractInlineImages" value="true"/>
+ <!-- VULNERABLE: Explicitly re-enable XFA parsing -->
+ <property name="extractAcroFormContent" value="true"/>
+ </entry>
+</entries>
diff --git a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
index 80fbc171883..b706684c791 100644
--- a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
+++ b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
@@ -157,6 +157,12 @@
<str name="tikaserver.metadata.compatibility">${solr.test.tikaserver.metadata.compatibility:false}</str>
</requestHandler>
+ <!-- Handler with vulnerable parseContext for testing CVE-2025-54988 -->
+ <requestHandler name="/update/extract/vulnerable" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+ <str name="parseContext.config">parseContext-vulnerable.xml</str>
+ <str name="extraction.backend">local</str>
+ </requestHandler>
+
<requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
<str name="extraction.backend">${solr.test.extraction.backend:local}</str>
<str name="tikaserver.url">${solr.test.tikaserver.url:}</str>
diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java
new file mode 100644
index 00000000000..d07e733a7bd
--- /dev/null
+++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.request.LocalSolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test for CVE-2025-54988 mitigation.
+ *
+ * <p>This test verifies that the default Tika configuration in Solr prevents XXE (XML External
+ * Entity) injection attacks via crafted XFA (XML Forms Architecture) content in PDF files.
+ *
+ * <p>CVE-2025-54988 affects Apache Tika versions 1.13 through 3.2.1, allowing attackers to exploit
+ * XXE vulnerabilities in PDF parsing when XFA forms are processed.
+ *
+ * <p>The mitigation disables XFA parsing by setting:
+ *
+ * <ul>
+ * <li>extractAcroFormContent = false
+ * <li>ifXFAExtractOnlyXFA = false
+ * </ul>
+ */
+public class CVE202554988Test extends SolrTestCaseJ4 {
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr/collection1").getParent());
+ }
+
+ /**
+ * Test that the default Tika configuration prevents XFA parsing and thus mitigates
+ * CVE-2025-54988.
+ *
+ * <p>The default configuration should NOT extract XFA content, preventing XXE attacks.
+ */
+ @Test
+ public void testDefaultConfigPreventsXFAParsing() throws Exception {
+ // Load the PDF with XFA/XXE content using hardened default config
+ loadLocal(
+ "extraction/pdf-with-xfa-xxe.pdf",
+ "literal.id",
+ "doc-default",
+ "fmap.content",
+ "extractedContent",
+ "uprefix",
+ "ignored_");
+
+ assertU(commit());
+
+ // Verify document was indexed
+ assertQ(req("id:doc-default"), "//*[@numFound='1']");
+
+ // Verify the document has extractedContent field
+ assertQ(req("id:doc-default"), "//arr[@name='extractedContent']/str");
+
+ // Verify that XFA form field names are NOT extracted (proves XFA parsing is disabled)
+ // The PDF contains XFA fields named "companyName", "secretData", and "xxeField"
+ // These field names should NOT appear when XFA parsing is disabled
+ assertQ(req("id:doc-default AND extractedContent:companyName"), "//*[@numFound='0']");
+
+ assertQ(req("id:doc-default AND extractedContent:secretData"), "//*[@numFound='0']");
+
+ assertQ(req("id:doc-default AND extractedContent:xxeField"), "//*[@numFound='0']");
+ }
+
+ /**
+ * Test with vulnerable parseContext configuration to demonstrate CVE-2025-54988.
+ *
+ * <p>This test uses a parseContext configuration that explicitly enables XFA parsing
+ * (extractAcroFormContent=true), which was the vulnerable default before the fix.
+ *
+ * <p>This test verifies that:
+ *
+ * <ol>
+ * <li>PDFs can still be extracted when XFA parsing is enabled
+ * <li>XFA form field names ARE extracted when extractAcroFormContent=true
+ * <li>This demonstrates the attack vector for CVE-2025-54988
+ * </ol>
+ */
+ @Test
+ public void testVulnerableConfigEnablesXFAParsing() throws Exception {
+ // Load using the /update/extract/vulnerable handler which has extractAcroFormContent=true
+ loadLocalFromHandler(
+ "/update/extract/vulnerable",
+ "extraction/pdf-with-xfa-xxe.pdf",
+ "literal.id",
+ "doc-vulnerable",
+ "fmap.content",
+ "extractedContent",
+ "uprefix",
+ "ignored_");
+
+ assertU(commit());
+
+ // Verify document was indexed successfully
+ assertQ(req("id:doc-vulnerable"), "//*[@numFound='1']");
+
+ // Verify basic PDF content is extracted (the non-XFA text "Test PDF")
+ assertQ(
+ req("id:doc-vulnerable AND extractedContent:Test AND extractedContent:PDF"),
+ "//*[@numFound='1']");
+
+ // CRITICAL: With extractAcroFormContent=true, XFA field names ARE extracted
+ // This proves XFA content is being parsed - the attack vector for CVE-2025-54988
+ assertQ(req("id:doc-vulnerable AND extractedContent:companyName"), "//*[@numFound='1']");
+
+ assertQ(req("id:doc-vulnerable AND extractedContent:secretData"), "//*[@numFound='1']");
+
+ assertQ(req("id:doc-vulnerable AND extractedContent:xxeField"), "//*[@numFound='1']");
+ }
+
+ /** Helper method to load a file into the default extraction handler. */
+ private SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
+ return loadLocalFromHandler("/update/extract", filename, args);
+ }
+
+ /** Helper method to load a file into a specific extraction handler. */
+ private SolrQueryResponse loadLocalFromHandler(String handler, String filename, String... args)
+ throws Exception {
+ LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
+ try {
+ // Create content stream from test file
+ List<ContentStream> cs = new ArrayList<>();
+ cs.add(new ContentStreamBase.FileStream(getFile(filename)));
+ req.setContentStreams(cs);
+
+ // Get handler and process request
+ ExtractingRequestHandler extractHandler =
+ (ExtractingRequestHandler) h.getCore().getRequestHandler(handler);
+ SolrQueryResponse rsp = new SolrQueryResponse();
+ extractHandler.handleRequest(req, rsp);
+
+ return rsp;
+ } finally {
+ req.close();
+ }
+ }
+}