(solr) branch branch_9x updated: SOLR-17888 Mitigate CVE-2025-54988 in tika pdf parser (#3932)

janhoy Sun, 14 Dec 2025 09:35:44 -0800

This is an automated email from the ASF dual-hosted git repository.

janhoy pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git



The following commit(s) were added to refs/heads/branch_9x by this push:
     new 22766e82ec5 SOLR-17888 Mitigate CVE-2025-54988 in tika pdf parser (#3932)
22766e82ec5 is described below

commit 22766e82ec5950dabe9a30c53e39113e0dfc1a55
Author: Jan Høydahl <[email protected]>
AuthorDate: Sun Dec 14 18:35:27 2025 +0100

    SOLR-17888 Mitigate CVE-2025-54988 in tika pdf parser (#3932)
---
 ...17888-mitigate-tika-cve-disable-xfa-parsing.yml |   9 ++
 .../handler/extraction/ParseContextConfig.java     |  40 +++++
 .../src/test-files/extraction/pdf-with-xfa-xxe.pdf | 100 +++++++++++++
 .../collection1/conf/parseContext-vulnerable.xml   |  29 ++++
 .../solr/collection1/conf/solrconfig.xml           |   6 +
 .../solr/handler/extraction/CVE202554988Test.java  | 164 +++++++++++++++++++++
 6 files changed, 348 insertions(+)

diff --git a/changelog/unreleased/SOLR-17888-mitigate-tika-cve-disable-xfa-parsing.yml b/changelog/unreleased/SOLR-17888-mitigate-tika-cve-disable-xfa-parsing.yml
new file mode 100644
index 00000000000..656a41512f9
--- /dev/null
+++ b/changelog/unreleased/SOLR-17888-mitigate-tika-cve-disable-xfa-parsing.yml
@@ -0,0 +1,9 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: Mitigate CVE-2025-54988 by disabling XFA parsing in PDF documents when using SolrCell extraction
+type: security # added, changed, fixed, deprecated, removed, dependency_update, security, other
+authors:
+  - name: Jan Høydahl
+    url: https://home.apache.org/phonebook.html?uid=janhoy
+links:
+  - name: SOLR-17888
+    url: https://issues.apache.org/jira/browse/SOLR-17888
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
index 327fe3e606a..82f8ffbd3f2 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
@@ -29,6 +29,7 @@ import java.util.Map;
 import org.apache.solr.core.SolrResourceLoader;
 import org.apache.solr.util.SafeXMLParsing;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
@@ -41,6 +42,7 @@ public class ParseContextConfig {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   private final Map<Class<?>, Object> entries = new HashMap<>();
+  private boolean extractAcroFormContentExplicitlySet = false;
 
   /** Creates an empty Config without any settings (used as placeholder). */
   public ParseContextConfig() {}
@@ -117,6 +119,12 @@ public class ParseContextConfig {
         }
         method.invoke(
             instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
+
+        // Track if extractAcroFormContent was explicitly set
+        if ("org.apache.tika.parser.pdf.PDFParserConfig".equals(className)
+            && "extractAcroFormContent".equals(propertyName)) {
+          extractAcroFormContentExplicitlySet = true;
+        }
       }
 
       entries.put(interfaceClass, instance);
@@ -136,10 +144,42 @@ public class ParseContextConfig {
   public ParseContext create() {
     final ParseContext result = new ParseContext();
 
+    // Apply user-configured entries first
     for (Map.Entry<Class<?>, Object> entry : entries.entrySet()) {
       result.set((Class) entry.getKey(), entry.getValue());
     }
 
+    // Apply secure defaults for PDF parsing to mitigate CVE-2025-54988
+    PDFParserConfig pdfConfig = result.get(PDFParserConfig.class);
+
+    if (pdfConfig == null) {
+      // No user config - create secure defaults
+      pdfConfig = new PDFParserConfig();
+      pdfConfig.setExtractAcroFormContent(false);
+      pdfConfig.setIfXFAExtractOnlyXFA(false);
+      result.set(PDFParserConfig.class, pdfConfig);
+      log.debug(
+          "Applied secure PDF parsing defaults: extractAcroFormContent=false (CVE-2025-54988 mitigation)");
+    } else if (!extractAcroFormContentExplicitlySet) {
+      // User provided PDFParserConfig but did NOT explicitly set extractAcroFormContent
+      // Apply secure default to protect against CVE-2025-54988
+      pdfConfig.setExtractAcroFormContent(false);
+      pdfConfig.setIfXFAExtractOnlyXFA(false);
+      log.debug(
+          "Applied secure default extractAcroFormContent=false for CVE-2025-54988 mitigation");
+    } else {
+      // User explicitly set extractAcroFormContent - respect their choice but warn if vulnerable
+      if (pdfConfig.getExtractAcroFormContent()) {
+        log.warn(
+            "extractAcroFormContent=true is explicitly set, which may be vulnerable to CVE-2025-54988 XXE attacks. "
+                + "Ensure you trust all PDF sources or disable XFA parsing.");
+      } else {
+        // User explicitly disabled extractAcroFormContent - also disable ifXFAExtractOnlyXFA for
+        // complete mitigation
+        pdfConfig.setIfXFAExtractOnlyXFA(false);
+      }
+    }
+
     return result;
   }
 }
diff --git 
a/solr/modules/extraction/src/test-files/extraction/pdf-with-xfa-xxe.pdf 
b/solr/modules/extraction/src/test-files/extraction/pdf-with-xfa-xxe.pdf
new file mode 100644
index 00000000000..66e4c03ea30
--- /dev/null
+++ b/solr/modules/extraction/src/test-files/extraction/pdf-with-xfa-xxe.pdf
@@ -0,0 +1,100 @@
+%PDF-1.4
+1 0 obj
+<<
+/Type /Catalog
+/AcroForm 2 0 R
+/Pages 3 0 R
+>>
+endobj
+
+2 0 obj
+<<
+/XFA [
+  (template) 4 0 R
+]
+>>
+endobj
+
+3 0 obj
+<<
+/Type /Pages
+/Kids [5 0 R]
+/Count 1
+>>
+endobj
+
+4 0 obj
+<<
+/Length 520
+>>
+stream
+<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
+  <subform name="form1">
+    <field name="companyName">
+      <value>
+        <text>ACME Corporation XFA Form Data</text>
+      </value>
+    </field>
+    <field name="secretData">
+      <value>
+        <text>XFA_SENSITIVE_CONTENT_12345</text>
+      </value>
+    </field>
+    <field name="xxeField">
+      <value>
+        <text>POTENTIAL_XXE_ATTACK_VECTOR</text>
+      </value>
+    </field>
+  </subform>
+</template>
+endstream
+endobj
+
+5 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/MediaBox [0 0 612 792]
+/Contents 6 0 R
+/Resources <<
+  /Font <<
+    /F1 <<
+      /Type /Font
+      /Subtype /Type1
+      /BaseFont /Helvetica
+    >>
+  >>
+>>
+>>
+endobj
+
+6 0 obj
+<<
+/Length 44
+>>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(Test PDF) Tj
+ET
+endstream
+endobj
+
+xref
+0 7
+0000000000 65535 f 
+0000000009 00000 n 
+0000000087 00000 n 
+0000000140 00000 n 
+0000000197 00000 n 
+0000000772 00000 n 
+0000000961 00000 n 
+trailer
+<<
+/Size 7
+/Root 1 0 R
+>>
+startxref
+1054
+%%EOF
diff --git 
a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/parseContext-vulnerable.xml
 
b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/parseContext-vulnerable.xml
new file mode 100644
index 00000000000..d2677cb1004
--- /dev/null
+++ 
b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/parseContext-vulnerable.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+  Vulnerable parseContext configuration that explicitly ENABLES XFA parsing.
+  This overrides the secure defaults to demonstrate CVE-2025-54988 
vulnerability.
+  DO NOT USE IN PRODUCTION.
+-->
+<entries>
+  <entry class="org.apache.tika.parser.pdf.PDFParserConfig" 
impl="org.apache.tika.parser.pdf.PDFParserConfig">
+    <property name="extractInlineImages" value="true"/>
+    <!-- VULNERABLE: Explicitly re-enable XFA parsing -->
+    <property name="extractAcroFormContent" value="true"/>
+  </entry>
+</entries>
diff --git 
a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
 
b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
index 80fbc171883..b706684c791 100644
--- 
a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
+++ 
b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
@@ -157,6 +157,12 @@
     <str 
name="tikaserver.metadata.compatibility">${solr.test.tikaserver.metadata.compatibility:false}</str>
   </requestHandler>
 
+  <!-- Handler with vulnerable parseContext for testing CVE-2025-54988 -->
+  <requestHandler name="/update/extract/vulnerable" 
class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+    <str name="parseContext.config">parseContext-vulnerable.xml</str>
+    <str name="extraction.backend">local</str>
+  </requestHandler>
+
   <requestHandler name="/update/extract/lit-def" 
class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
     <str name="extraction.backend">${solr.test.extraction.backend:local}</str>
     <str name="tikaserver.url">${solr.test.tikaserver.url:}</str>
diff --git 
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java
 
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java
new file mode 100644
index 00000000000..d07e733a7bd
--- /dev/null
+++ 
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.request.LocalSolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test for CVE-2025-54988 mitigation.
+ *
+ * <p>This test verifies that the default Tika configuration in Solr prevents 
XXE (XML External
+ * Entity) injection attacks via crafted XFA (XML Forms Architecture) content 
in PDF files.
+ *
+ * <p>CVE-2025-54988 affects Apache Tika versions 1.13 through 3.2.1, allowing 
attackers to exploit
+ * XXE vulnerabilities in PDF parsing when XFA forms are processed.
+ *
+ * <p>The mitigation disables XFA parsing by setting:
+ *
+ * <ul>
+ *   <li>extractAcroFormContent = false
+ *   <li>ifXFAExtractOnlyXFA = false
+ * </ul>
+ */
+public class CVE202554988Test extends SolrTestCaseJ4 {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig.xml", "schema.xml", 
getFile("extraction/solr/collection1").getParent());
+  }
+
+  /**
+   * Test that the default Tika configuration prevents XFA parsing and thus 
mitigates
+   * CVE-2025-54988.
+   *
+   * <p>The default configuration should NOT extract XFA content, preventing 
XXE attacks.
+   */
+  @Test
+  public void testDefaultConfigPreventsXFAParsing() throws Exception {
+    // Load the PDF with XFA/XXE content using hardened default config
+    loadLocal(
+        "extraction/pdf-with-xfa-xxe.pdf",
+        "literal.id",
+        "doc-default",
+        "fmap.content",
+        "extractedContent",
+        "uprefix",
+        "ignored_");
+
+    assertU(commit());
+
+    // Verify document was indexed
+    assertQ(req("id:doc-default"), "//*[@numFound='1']");
+
+    // Verify the document has extractedContent field
+    assertQ(req("id:doc-default"), "//arr[@name='extractedContent']/str");
+
+    // Verify that XFA form field names are NOT extracted (proves XFA parsing 
is disabled)
+    // The PDF contains XFA fields named "companyName", "secretData", and 
"xxeField"
+    // These field names should NOT appear when XFA parsing is disabled
+    assertQ(req("id:doc-default AND extractedContent:companyName"), 
"//*[@numFound='0']");
+
+    assertQ(req("id:doc-default AND extractedContent:secretData"), 
"//*[@numFound='0']");
+
+    assertQ(req("id:doc-default AND extractedContent:xxeField"), 
"//*[@numFound='0']");
+  }
+
+  /**
+   * Test with vulnerable parseContext configuration to demonstrate 
CVE-2025-54988.
+   *
+   * <p>This test uses a parseContext configuration that explicitly enables 
XFA parsing
+   * (extractAcroFormContent=true), which was the vulnerable default before 
the fix.
+   *
+   * <p>This test verifies that:
+   *
+   * <ol>
+   *   <li>PDFs can still be extracted when XFA parsing is enabled
+   *   <li>XFA form field names ARE extracted when extractAcroFormContent=true
+   *   <li>This demonstrates the attack vector for CVE-2025-54988
+   * </ol>
+   */
+  @Test
+  public void testVulnerableConfigEnablesXFAParsing() throws Exception {
+    // Load using the /update/extract/vulnerable handler which has 
extractAcroFormContent=true
+    loadLocalFromHandler(
+        "/update/extract/vulnerable",
+        "extraction/pdf-with-xfa-xxe.pdf",
+        "literal.id",
+        "doc-vulnerable",
+        "fmap.content",
+        "extractedContent",
+        "uprefix",
+        "ignored_");
+
+    assertU(commit());
+
+    // Verify document was indexed successfully
+    assertQ(req("id:doc-vulnerable"), "//*[@numFound='1']");
+
+    // Verify basic PDF content is extracted (the non-XFA text "Test PDF")
+    assertQ(
+        req("id:doc-vulnerable AND extractedContent:Test AND 
extractedContent:PDF"),
+        "//*[@numFound='1']");
+
+    // CRITICAL: With extractAcroFormContent=true, XFA field names ARE 
extracted
+    // This proves XFA content is being parsed - the attack vector for 
CVE-2025-54988
+    assertQ(req("id:doc-vulnerable AND extractedContent:companyName"), 
"//*[@numFound='1']");
+
+    assertQ(req("id:doc-vulnerable AND extractedContent:secretData"), 
"//*[@numFound='1']");
+
+    assertQ(req("id:doc-vulnerable AND extractedContent:xxeField"), 
"//*[@numFound='1']");
+  }
+
+  /** Helper method to load a file into the default extraction handler. */
+  private SolrQueryResponse loadLocal(String filename, String... args) throws 
Exception {
+    return loadLocalFromHandler("/update/extract", filename, args);
+  }
+
+  /** Helper method to load a file into a specific extraction handler. */
+  private SolrQueryResponse loadLocalFromHandler(String handler, String 
filename, String... args)
+      throws Exception {
+    LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
+    try {
+      // Create content stream from test file
+      List<ContentStream> cs = new ArrayList<>();
+      cs.add(new ContentStreamBase.FileStream(getFile(filename)));
+      req.setContentStreams(cs);
+
+      // Get handler and process request
+      ExtractingRequestHandler extractHandler =
+          (ExtractingRequestHandler) h.getCore().getRequestHandler(handler);
+      SolrQueryResponse rsp = new SolrQueryResponse();
+      extractHandler.handleRequest(req, rsp);
+
+      return rsp;
+    } finally {
+      req.close();
+    }
+  }
+}

(solr) branch branch_9x updated: SOLR-17888 Mitigate CVE-2025-54988 in tika pdf parser (#3932)

Reply via email to