Copilot commented on code in PR #3932: URL: https://github.com/apache/solr/pull/3932#discussion_r2604664188
########## solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java: ########## @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.List; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; +import org.apache.solr.request.LocalSolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test for CVE-2025-54988 mitigation. + * + * <p>This test verifies that the default Tika configuration in Solr prevents XXE (XML External + * Entity) injection attacks via crafted XFA (XML Forms Architecture) content in PDF files. + * + * <p>CVE-2025-54988 affects Apache Tika versions 1.13 through 3.2.1, allowing attackers to exploit + * XXE vulnerabilities in PDF parsing when XFA forms are processed. + * + * <p>The mitigation disables XFA parsing by setting: - extractAcroFormContent = false - + * ifXFAExtractOnlyXFA = false + */ +public class CVE202554988Test extends SolrTestCaseJ4 { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr/collection1").getParent()); + } + + /** + * Test that the default Tika configuration prevents XFA parsing and thus mitigates + * CVE-2025-54988. + * + * <p>The default configuration should NOT extract XFA content, preventing XXE attacks. + */ + @Test + public void testDefaultConfigPreventsXFAParsing() throws Exception { + // Load the PDF with XFA/XXE content using hardened default config + loadLocal( + "extraction/pdf-with-xfa-xxe.pdf", + "literal.id", + "doc-default", + "fmap.content", + "extractedContent", + "uprefix", + "ignored_"); + + assertU(commit()); + + // Verify document was indexed + assertQ(req("id:doc-default"), "//*[@numFound='1']"); + + // Verify the document has extractedContent field + assertQ(req("id:doc-default"), "//arr[@name='extractedContent']/str"); + + // Verify that XFA form field names are NOT extracted (proves XFA parsing is disabled) + // The PDF contains XFA fields named "companyName", "secretData", and "xxeField" + // These field names should NOT appear when XFA parsing is disabled + assertQ(req("id:doc-default AND extractedContent:companyName"), "//*[@numFound='0']"); + + assertQ(req("id:doc-default AND extractedContent:secretData"), "//*[@numFound='0']"); + + assertQ(req("id:doc-default AND extractedContent:xxeField"), "//*[@numFound='0']"); + } + + /** + * Test with vulnerable parseContext configuration to demonstrate CVE-2025-54988. + * + * <p>This test uses a parseContext configuration that explicitly enables XFA parsing + * (extractAcroFormContent=true), which was the vulnerable default before the fix. + * + * <p>This test verifies that: 1. PDFs can still be extracted when XFA parsing is enabled 2. XFA + * form field names ARE extracted when extractAcroFormContent=true 3. This demonstrates the attack + * vector for CVE-2025-54988 Review Comment: The JavaDoc formatting is incorrect. The numbered list is written as a single sentence rather than using proper list formatting. Consider reformatting as: ```java * <p>This test verifies that: * <ol> * <li>PDFs can still be extracted when XFA parsing is enabled</li> * <li>XFA form field names ARE extracted when extractAcroFormContent=true</li> * <li>This demonstrates the attack vector for CVE-2025-54988</li> * </ol> ``` ```suggestion * <p>This test verifies that: * <ol> * <li>PDFs can still be extracted when XFA parsing is enabled</li> * <li>XFA form field names ARE extracted when extractAcroFormContent=true</li> * <li>This demonstrates the attack vector for CVE-2025-54988</li> * </ol> ``` ########## solr/modules/extraction/src/test/org/apache/solr/handler/extraction/CVE202554988Test.java: ########## @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.List; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; +import org.apache.solr.request.LocalSolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test for CVE-2025-54988 mitigation. + * + * <p>This test verifies that the default Tika configuration in Solr prevents XXE (XML External + * Entity) injection attacks via crafted XFA (XML Forms Architecture) content in PDF files. + * + * <p>CVE-2025-54988 affects Apache Tika versions 1.13 through 3.2.1, allowing attackers to exploit + * XXE vulnerabilities in PDF parsing when XFA forms are processed. + * + * <p>The mitigation disables XFA parsing by setting: - extractAcroFormContent = false - + * ifXFAExtractOnlyXFA = false Review Comment: The comment has inconsistent formatting with a missing space after the dash. The line should read: ``` * <p>The mitigation disables XFA parsing by setting: - extractAcroFormContent = false - * ifXFAExtractOnlyXFA = false ``` Consider reformatting as a proper list: ``` * <p>The mitigation disables XFA parsing by setting: * <ul> * <li>extractAcroFormContent = false</li> * <li>ifXFAExtractOnlyXFA = false</li> * </ul> ``` ```suggestion * <p>The mitigation disables XFA parsing by setting: * <ul> * <li>extractAcroFormContent = false</li> * <li>ifXFAExtractOnlyXFA = false</li> * </ul> ``` ########## solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java: ########## @@ -136,10 +144,38 @@ private Object getValueFromString(Class<?> targetType, String text) { public ParseContext create() { final ParseContext result = new ParseContext(); + // Apply user-configured entries first for (Map.Entry<Class<?>, Object> entry : entries.entrySet()) { result.set((Class) entry.getKey(), entry.getValue()); } + // Apply secure defaults for PDF parsing to mitigate CVE-2025-54988 + PDFParserConfig pdfConfig = result.get(PDFParserConfig.class); + + if (pdfConfig == null) { + // No user config - create secure defaults + pdfConfig = new PDFParserConfig(); + pdfConfig.setExtractAcroFormContent(false); + pdfConfig.setIfXFAExtractOnlyXFA(false); + result.set(PDFParserConfig.class, pdfConfig); + log.debug( + "Applied secure PDF parsing defaults: extractAcroFormContent=false (CVE-2025-54988 mitigation)"); + } else if (!extractAcroFormContentExplicitlySet) { + // User provided PDFParserConfig but did NOT explicitly set extractAcroFormContent + // Apply secure default to protect against CVE-2025-54988 + pdfConfig.setExtractAcroFormContent(false); + pdfConfig.setIfXFAExtractOnlyXFA(false); + log.debug( + "Applied secure default extractAcroFormContent=false for CVE-2025-54988 mitigation"); + } else { + // User explicitly set extractAcroFormContent - respect their choice but warn if vulnerable + if (pdfConfig.getExtractAcroFormContent()) { + log.warn( + "extractAcroFormContent=true is explicitly set, which may be vulnerable to CVE-2025-54988 XXE attacks. " + + "Ensure you trust all PDF sources or disable XFA parsing."); + } + } Review Comment: The `ifXFAExtractOnlyXFA` flag is not set to false when a user explicitly configures `extractAcroFormContent=false`. This could leave the system partially vulnerable if `ifXFAExtractOnlyXFA` defaults to true in the PDFParserConfig. To ensure complete mitigation of CVE-2025-54988, `ifXFAExtractOnlyXFA` should also be set to false in this branch. Consider adding: ```java } else { // User explicitly set extractAcroFormContent - respect their choice but warn if vulnerable if (pdfConfig.getExtractAcroFormContent()) { log.warn( "extractAcroFormContent=true is explicitly set, which may be vulnerable to CVE-2025-54988 XXE attacks. " + "Ensure you trust all PDF sources or disable XFA parsing."); } else { // User explicitly disabled extractAcroFormContent - also disable ifXFAExtractOnlyXFA for complete mitigation pdfConfig.setIfXFAExtractOnlyXFA(false); } } ``` ```suggestion } else { // User explicitly disabled extractAcroFormContent - also disable ifXFAExtractOnlyXFA for complete mitigation pdfConfig.setIfXFAExtractOnlyXFA(false); } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
