This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 394c737a47 3x-ooxml-bigdecimal-dos (#2840)
394c737a47 is described below
commit 394c737a470638993dc036bdff759f9faa625384
Author: Tim Allison <[email protected]>
AuthorDate: Wed May 27 10:10:01 2026 -0400
3x-ooxml-bigdecimal-dos (#2840)
bigdecimal issue identified by @tonghuaroot
---
.../parser/microsoft/ooxml/MetadataExtractor.java | 276 +++++++++++++++------
.../microsoft/ooxml/MetadataExtractorTest.java | 205 +++++++++++++++
2 files changed, 409 insertions(+), 72 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 97efe3e186..0a4426cc5f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -16,17 +16,23 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.io.InputStream;
import java.math.BigDecimal;
import java.util.Date;
import java.util.Optional;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
-import
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
@@ -37,10 +43,12 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.SummaryExtractor;
import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
import
org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.tika.utils.XMLReaderUtils;
/**
* OOXML metadata extractor.
@@ -51,6 +59,28 @@ import
org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
*/
public class MetadataExtractor {
+ private static final String CUSTOM_PROPERTIES_REL =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";
+
+ /**
+ * Hard cap on the accumulated text-content of a single property element
+ * inside docProps/custom.xml. Real OOXML property values are at most a few
+ * hundred bytes; anything beyond this is either corruption or an attacker
+ * trying to drive memory or CPU pressure (cf. the {@code <vt:decimal>}
+ * BigDecimal DoS where a 1M-digit literal compresses ~1000:1 in deflate).
+ * 64 KB leaves headroom for any legitimate value while bounding the
+ * slow-path inputs decisively.
+ */
+ static final int MAX_TEXT_BUFFER_LENGTH = 64 * 1024;
+
+ /**
+ * Hard cap on the {@code <vt:decimal>} text length passed to
+ * {@link BigDecimal#BigDecimal(String)}. JDK 17's parser is O(n²) in the
+ * digit count, so even a 64 KB string costs noticeable CPU. Real-world
+ * decimal values fit in well under 50 digits; 256 is generous.
+ */
+ static final int MAX_DECIMAL_LENGTH = 256;
+
private final POIXMLTextExtractor extractor;
public MetadataExtractor(POIXMLTextExtractor extractor) {
@@ -65,7 +95,13 @@ public class MetadataExtractor {
extractor instanceof XPSTextExtractor) &&
extractor.getPackage() != null)) {
extractMetadata(extractor.getCoreProperties(), metadata);
extractMetadata(extractor.getExtendedProperties(), metadata);
- extractMetadata(extractor.getCustomProperties(), metadata);
+ // Custom properties are read via SAX directly from the OPC part
+ // rather than through POI/XMLBeans. The XMLBeans path materializes
+ // an attacker-controlled <vt:decimal> through BigDecimal(String),
+ // which is O(n²) on JDK 17 -- a 3 KB crafted carrier with a
+ // 1,000,000-digit literal burns ~25 s of CPU before this method
+ // even returns. See ooxml-bigdecimal-dos.
+ extractCustomPropertiesViaSAX(extractor.getPackage(), metadata);
}
}
@@ -157,85 +193,181 @@ public class MetadataExtractor {
}
}
- private void extractMetadata(POIXMLProperties.CustomProperties properties,
Metadata metadata) {
-
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
props =
- properties.getUnderlyingProperties();
- for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
- CTProperty property = props.getPropertyArray(i);
- String val = null;
- Date date = null;
-
- if (property.isSetLpwstr()) {
- val = property.getLpwstr();
- } else if (property.isSetLpstr()) {
- val = property.getLpstr();
- } else if (property.isSetDate()) {
- date = property.getDate().getTime();
- } else if (property.isSetFiletime()) {
- date = property.getFiletime().getTime();
- } else if (property.isSetBool()) {
- val = Boolean.toString(property.getBool());
+ /**
+ * Parse {@code docProps/custom.xml} directly via SAX, bypassing
+ * POI/XMLBeans. The XMLBeans path materializes an attacker-controlled
+ * {@code <vt:decimal>} through {@link BigDecimal#BigDecimal(String)}
+ * during XML deserialization, which is O(n²) in the digit count on
+ * JDK 17. By reading the part ourselves we can cap both the buffered
+ * text content ({@link #MAX_TEXT_BUFFER_LENGTH}) and the decimal
+ * literal length ({@link #MAX_DECIMAL_LENGTH}) before any slow parse
+ * runs.
+ * <p>
+ * Extraction is best-effort: a {@code null} package, a package without a
+ * custom-properties part, and any exception raised while reading or
+ * parsing the part all leave {@code metadata} untouched.
+ *
+ * @param opcPackage package to read the custom-properties part from; may
+ * be {@code null}, in which case nothing is done
+ * @param metadata receives the extracted {@code custom:*} values
+ */
+ private void extractCustomPropertiesViaSAX(OPCPackage opcPackage, Metadata
metadata) {
+ if (opcPackage == null) {
+ return;
+ }
+ try {
+ // getRelatedPart returns null when no relationship of this type
+ // exists or when the lookup itself throws.
+ PackagePart custPart = getRelatedPart(opcPackage,
CUSTOM_PROPERTIES_REL);
+ if (custPart == null) {
+ return;
+ }
+ CustomPropertiesHandler handler = new CustomPropertiesHandler();
+ try (InputStream is = custPart.getInputStream()) {
+ // NOTE(review): a fresh ParseContext is created here, so nothing
+ // from a caller-supplied context is consulted -- confirm intended.
+ XMLReaderUtils.parseSAX(is, handler, new ParseContext());
}
+ // Only reached when the parse finished without throwing; a failure
+ // part-way through the XML discards the properties read so far.
+ handler.applyTo(metadata);
+ } catch (Exception e) {
+ // Deliberately ignored so that a failure in this optional part never
+ // propagates to the caller. NOTE(review): consider logging it.
+ }
+ }
- // Integers
- else if (property.isSetI1()) {
- val = Integer.toString(property.getI1());
- } else if (property.isSetI2()) {
- val = Integer.toString(property.getI2());
- } else if (property.isSetI4()) {
- val = Integer.toString(property.getI4());
- } else if (property.isSetI8()) {
- val = Long.toString(property.getI8());
- } else if (property.isSetInt()) {
- val = Integer.toString(property.getInt());
+ /**
+  * Resolves the part targeted by the first package-level relationship of
+  * the given type.
+  *
+  * @param opcPackage package whose relationships are searched
+  * @param relationshipType relationship type URI to look for
+  * @return the related part, or {@code null} when there is no such
+  *         relationship or when the lookup throws
+  */
+ private static PackagePart getRelatedPart(OPCPackage opcPackage, String relationshipType) {
+     PackagePart related = null;
+     try {
+         PackageRelationshipCollection matches =
+                 opcPackage.getRelationshipsByType(relationshipType);
+         if (matches != null && matches.size() != 0) {
+             PackageRelationship first = matches.getRelationship(0);
+             if (first != null) {
+                 related = opcPackage.getPart(first);
+             }
+         }
+     } catch (Exception e) {
+         // Any lookup failure is reported to the caller as "no such part".
+         related = null;
+     }
+     return related;
+ }
- // Unsigned Integers
- else if (property.isSetUi1()) {
- val = Integer.toString(property.getUi1());
- } else if (property.isSetUi2()) {
- val = Integer.toString(property.getUi2());
- } else if (property.isSetUi4()) {
- val = Long.toString(property.getUi4());
- } else if (property.isSetUi8()) {
- val = property.getUi8().toString();
- } else if (property.isSetUint()) {
- val = Long.toString(property.getUint());
+ /**
+  * Copies SAX {@code characters()} content into {@code buf} without letting
+  * the buffer grow beyond {@link #MAX_TEXT_BUFFER_LENGTH}. Whatever does not
+  * fit is dropped without any signal, so a truncated value still reaches
+  * the downstream parsing.
+  *
+  * @param buf destination buffer
+  * @param ch source characters
+  * @param start offset of the first character to copy
+  * @param length number of characters offered
+  */
+ static void appendCapped(StringBuilder buf, char[] ch, int start, int length) {
+     int room = MAX_TEXT_BUFFER_LENGTH - buf.length();
+     if (room > 0) {
+         int toCopy = length < room ? length : room;
+         buf.append(ch, start, toCopy);
+     }
+ }
+
+ /**
+ * SAX handler for {@code docProps/custom.xml} (custom properties).
+ * Matches the schema defined by Microsoft's
+ * {@code
http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}
+ * namespace, with value types coming from the {@code vt:} namespace.
+ */
+ static class CustomPropertiesHandler extends DefaultHandler {
+
+ private static final String VT_NS =
+
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";
+
+ private final Metadata customMetadata = new Metadata();
+ private String currentPropertyName;
+ private String currentValueType;
+ private final StringBuilder textBuffer = new StringBuilder();
+
+ /**
+ * Tracks which {@code <property>} is open and latches the type of the
+ * first {@code vt:} element seen inside it.
+ *
+ * @param uri namespace URI of the element; compared against the vt namespace
+ * @param localName local name of the element; all matching here uses it
+ * @param qName qualified name (not used)
+ * @param atts attributes; {@code name} is read from {@code <property>}
+ */
+ @Override
+ public void startElement(String uri, String localName, String qName,
Attributes atts) {
+ // Matched on local name alone; the namespace of <property> is not checked.
+ if ("property".equals(localName)) {
+ currentPropertyName = atts.getValue("name");
+ currentValueType = null;
+ } else if (VT_NS.equals(uri) && currentPropertyName != null
+ && currentValueType == null) {
+ // Latch the first vt: element seen while no value type is latched;
+ // for a well-formed part that is the direct vt: child of <property>.
+ // Containers like <vt:vector>/<vt:array> latch currentValueType
+ // here and their scalar children are then ignored, matching the
+ // prior POI/XMLBeans behavior which skipped vectors/arrays.
+ // NOTE(review): endElement releases the latch on any vt: end tag
+ // whose local name equals the latched type, so an element nested
+ // inside a container of the same local name would release it early
+ // and let later siblings be captured -- confirm whether a depth
+ // counter is needed.
+ currentValueType = localName;
+ // Drop any text buffered before this value element opened.
+ textBuffer.setLength(0);
}
+ }
+
+ /**
+ * Buffers character data. Text is appended on every callback, whether or
+ * not a value type is currently latched; {@code startElement} clears the
+ * buffer when it latches one, and {@code appendCapped} stops the buffer
+ * from growing past {@code MAX_TEXT_BUFFER_LENGTH}.
+ */
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ appendCapped(textBuffer, ch, start, length);
+ }
- // Reals
- else if (property.isSetR4()) {
- val = Float.toString(property.getR4());
- } else if (property.isSetR8()) {
- val = Double.toString(property.getR8());
- } else if (property.isSetDecimal()) {
- BigDecimal d = property.getDecimal();
- if (d == null) {
- val = null;
- } else {
- val = d.toPlainString();
+ /**
+  * Emits the buffered text as a {@code custom:<name>} value when the
+  * latched {@code vt:} value element closes, and clears the per-property
+  * state when {@code <property>} itself closes.
+  * <p>
+  * String types keep their raw text; every other type uses the trimmed
+  * text. Types without a case below (including containers such as vectors
+  * and arrays) emit nothing. Decimals are bounded twice: the literal length
+  * is capped before {@link BigDecimal#BigDecimal(String)} runs, and the
+  * parsed scale is capped before {@link BigDecimal#toPlainString()} runs,
+  * because a short exponent literal such as {@code 1E1000000000} parses
+  * instantly yet expands to a gigabyte-sized plain string.
+  *
+  * @param uri namespace URI of the closing element
+  * @param localName local name of the closing element
+  * @param qName qualified name (not used)
+  */
+ @Override
+ public void endElement(String uri, String localName, String qName) {
+     if (VT_NS.equals(uri) && currentValueType != null &&
+             localName.equals(currentValueType) && currentPropertyName != null) {
+         String raw = textBuffer.toString();
+         String trimmed = raw.trim();
+         String propName = "custom:" + currentPropertyName;
+         switch (currentValueType) {
+             case "lpwstr":
+             case "lpstr":
+             case "bstr":
+                 // String values are user-controlled metadata content;
+                 // preserve leading/trailing whitespace as the prior
+                 // POI getLpwstr()/getLpstr() path did.
+                 customMetadata.set(propName, raw);
+                 break;
+             case "filetime":
+             case "date":
+                 Property tikaProp = Property.externalDate(propName);
+                 customMetadata.set(tikaProp, trimmed);
+                 break;
+             case "bool":
+                 // xs:boolean lexical space allows "1"/"0" alongside
+                 // "true"/"false"; the prior POI path emitted
+                 // Boolean.toString(...). Preserve that normalization.
+                 // Any other text emits nothing.
+                 if ("1".equals(trimmed) || "true".equalsIgnoreCase(trimmed)) {
+                     customMetadata.set(propName, "true");
+                 } else if ("0".equals(trimmed) || "false".equalsIgnoreCase(trimmed)) {
+                     customMetadata.set(propName, "false");
+                 }
+                 break;
+             // Signed/unsigned integers and reals are passed through as
+             // trimmed text; they are not re-parsed here.
+             case "i1":
+             case "i2":
+             case "i4":
+             case "int":
+             case "ui1":
+             case "ui2":
+             case "i8":
+             case "ui4":
+             case "ui8":
+             case "uint":
+             case "r4":
+             case "r8":
+                 customMetadata.set(propName, trimmed);
+                 break;
+             case "decimal":
+                 // BigDecimal(String) is O(n²) on JDK 17; cap the input
+                 // length to keep an attacker-controlled <vt:decimal>
+                 // from burning CPU. Real values are < 50 chars; 256 is
+                 // generous. See ooxml-bigdecimal-dos.
+                 if (trimmed.length() > MAX_DECIMAL_LENGTH) {
+                     break;
+                 }
+                 try {
+                     BigDecimal d = new BigDecimal(trimmed);
+                     // The length cap alone does not bound the output:
+                     // BigDecimal accepts exponent notation, and
+                     // toPlainString() writes |scale| zeros, so a literal
+                     // like "1E1000000000" would allocate ~1 GB. A plain
+                     // literal within the length cap always has
+                     // 0 <= scale < MAX_DECIMAL_LENGTH, so anything outside
+                     // that magnitude came from an exponent; skip it.
+                     // Widen to long first: Math.abs(Integer.MIN_VALUE)
+                     // is negative.
+                     if (Math.abs((long) d.scale()) > MAX_DECIMAL_LENGTH) {
+                         break;
+                     }
+                     customMetadata.set(propName, d.toPlainString());
+                 } catch (NumberFormatException e) {
+                     //swallow: not a decimal literal, so no value is emitted
+                 }
+                 break;
+             default:
+                 break;
          }
- } else if (property.isSetArray()) {
- // TODO Fetch the array values and output
- } else if (property.isSetVector()) {
- // TODO Fetch the vector values and output
- } else if (property.isSetBlob() || property.isSetOblob()) {
- // TODO Decode, if possible
- } else if (property.isSetStream() || property.isSetOstream() ||
- property.isSetVstream()) {
- // TODO Decode, if possible
- } else if (property.isSetStorage() || property.isSetOstorage()) {
- // TODO Decode, if possible
- } else {
- // This type isn't currently supported yet, skip the property
+         // Release the latch so the buffered text is emitted at most once
+         // per value element.
+         currentValueType = null;
+     } else if ("property".equals(localName)) {
+         currentPropertyName = null;
+         currentValueType = null;
      }
+ }
- String propName = "custom:" + property.getName();
- if (date != null) {
- Property tikaProp = Property.externalDate(propName);
- metadata.set(tikaProp, date);
- } else if (val != null) {
- metadata.set(propName, val);
+ /**
+ * Copies every value collected by this handler into {@code metadata},
+ * name by name, as plain strings.
+ * <p>
+ * NOTE(review): values are copied with {@code metadata.add}, whereas the
+ * POI-based code this replaces called {@code metadata.set}; presumably
+ * {@code add} appends to values already stored under the same name
+ * instead of replacing them -- confirm that is intended.
+ *
+ * @param metadata destination for the collected {@code custom:*} entries
+ */
+ void applyTo(Metadata metadata) {
+ for (String name : customMetadata.names()) {
+ for (String value : customMetadata.getValues(name)) {
+ metadata.add(name, value);
+ }
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractorTest.java
new file mode 100644
index 0000000000..d941bcfc7b
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractorTest.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.InputSource;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tests for length-cap defenses in {@link MetadataExtractor}'s SAX-based
+ * custom-properties path (backport of the 4.x SAXBasedMetadataExtractor fix).
+ * <p>
+ * A 3 KB OOXML carrier whose {@code <vt:decimal>} contains a 1,000,000-digit
+ * numeric literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)) when reached through POI/XMLBeans.
+ * 3.x's MetadataExtractor now reads {@code docProps/custom.xml} via SAX
+ * directly, bypassing XMLBeans, and rejects decimal literals longer than
+ * {@link MetadataExtractor#MAX_DECIMAL_LENGTH} before constructing BigDecimal.
+ */
+public class MetadataExtractorTest {
+
+ // XML declaration plus the opening <Properties> element, declaring the
+ // custom-properties default namespace and the vt: value-type prefix.
+ private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+ + "<Properties
xmlns=\"http://schemas.openxmlformats.org/officeDocument"
+ + "/2006/custom-properties\""
+ + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument"
+ + "/2006/docPropsVTypes\">";
+ // Closing tag matching CUSTOM_HEADER.
+ private static final String CUSTOM_FOOTER = "</Properties>";
+
+ // One oversized append fills the buffer exactly to the cap; later
+ // appends leave its length unchanged.
+ @Test
+ public void appendCappedTruncatesAtLimit() {
+ StringBuilder buf = new StringBuilder();
+ char[] giant = new char[MetadataExtractor.MAX_TEXT_BUFFER_LENGTH +
10_000];
+ java.util.Arrays.fill(giant, '9');
+
+ MetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+ assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(),
+ "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+ // Further appends after the cap are silent no-ops.
+ MetadataExtractor.appendCapped(buf, giant, 0, 100);
+ assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(),
+ "appends past the cap must be silently dropped");
+ }
+
+ // With one character of room left, only the first offered character
+ // is kept.
+ @Test
+ public void appendCappedRespectsRemainingRoom() {
+ StringBuilder buf = new StringBuilder();
+ char[] padding = new char[MetadataExtractor.MAX_TEXT_BUFFER_LENGTH -
1];
+ java.util.Arrays.fill(padding, 'x');
+ buf.append(padding);
+
+ MetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 0, 3);
+ assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(),
+ "remaining room (1 char) must be filled; overflow dropped");
+ assertEquals('a', buf.charAt(buf.length() - 1));
+ }
+
+ // A short, ordinary decimal round-trips unchanged.
+ @Test
+ public void normalDecimalIsExtracted() throws Exception {
+ Metadata m = parseCustomProperties(customProperty("price", "decimal",
"1234.56"));
+ assertEquals("1234.56", m.get("custom:price"));
+ }
+
+ @Test
+ public void oversizedDecimalIsSkippedNotParsed() throws Exception {
+ // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the
+ // attacker's 1M-digit DoS payload. With the cap in place this should
+ // complete in milliseconds and the property should NOT be set.
+ String hugeDigits = "9".repeat(1000);
+ long start = System.nanoTime();
+ Metadata m = parseCustomProperties(customProperty("evil", "decimal",
hugeDigits));
+ long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+ assertNull(m.get("custom:evil"),
+ "oversized decimal must be rejected, not parsed");
+ // NOTE(review): wall-clock threshold; could be flaky on a heavily
+ // loaded build machine.
+ assertTrue(elapsedMs < 2_000,
+ "parse must complete quickly; took " + elapsedMs + "ms");
+ }
+
+ @Test
+ public void oversizedDecimalAttackPayloadCompletesQuickly() throws
Exception {
+ // Reporter's actual attack shape: 1,000,000 digits. Without the cap
+ // this takes ~25 s on JDK 17. With the cap the SAX read still has to
+ // accumulate the buffer (bounded to 64 KB by appendCapped) and the
+ // decimal-length check then rejects it.
+ String attackDigits = "9".repeat(1_000_000);
+ long start = System.nanoTime();
+ Metadata m = parseCustomProperties(customProperty("evil", "decimal",
attackDigits));
+ long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+ assertNull(m.get("custom:evil"));
+ // NOTE(review): wall-clock threshold; could be flaky on a heavily
+ // loaded build machine.
+ assertTrue(elapsedMs < 2_000,
+ "1M-digit attack payload must not trigger O(n²) BigDecimal;
took "
+ + elapsedMs + "ms");
+ }
+
+ @Test
+ public void stringValuesPreserveLeadingAndTrailingWhitespace() throws
Exception {
+ // The prior POI/XMLBeans path returned the raw element text for
+ // lpwstr/lpstr/bstr; the SAX path must not trim string content.
+ Metadata m = parseCustomProperties(customProperty("note", "lpwstr", "
hello "));
+ assertEquals(" hello ", m.get("custom:note"));
+ }
+
+ @Test
+ public void boolLexicalOneIsNormalizedToTrue() throws Exception {
+ // xs:boolean allows "1"/"0"; previous code emitted
Boolean.toString(...).
+ Metadata m = parseCustomProperties(customProperty("flag", "bool",
"1"));
+ assertEquals("true", m.get("custom:flag"));
+ }
+
+ // Counterpart of the "1" case: lexical "0" is reported as "false".
+ @Test
+ public void boolLexicalZeroIsNormalizedToFalse() throws Exception {
+ Metadata m = parseCustomProperties(customProperty("flag", "bool",
"0"));
+ assertEquals("false", m.get("custom:flag"));
+ }
+
+ // The literal words "true" and "false" come back unchanged.
+ @Test
+ public void boolLexicalTrueAndFalsePassThrough() throws Exception {
+ assertEquals("true",
+ parseCustomProperties(customProperty("a", "bool",
"true")).get("custom:a"));
+ assertEquals("false",
+ parseCustomProperties(customProperty("b", "bool",
"false")).get("custom:b"));
+ }
+
+ @Test
+ public void vectorContainingScalarIsNotEmittedAsScalar() throws Exception {
+ // Old POI/XMLBeans path explicitly skipped vector/array. The SAX path
+ // must not leak a nested <vt:lpstr> inside <vt:vector> as a scalar.
+ String xml = CUSTOM_HEADER
+ + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\"
pid=\"2\""
+ + " name=\"items\">"
+ + "<vt:vector size=\"2\" baseType=\"lpstr\">"
+ + "<vt:lpstr>foo</vt:lpstr>"
+ + "<vt:lpstr>bar</vt:lpstr>"
+ + "</vt:vector>"
+ + "</property>"
+ + CUSTOM_FOOTER;
+ Metadata m = parseCustomProperties(xml);
+ assertNull(m.get("custom:items"),
+ "vector contents must not be emitted as a scalar custom
property");
+ }
+
+ @Test
+ public void oversizedStringIsTruncatedNotRejected() throws Exception {
+ // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text
+ // accumulation would still be a memory pressure vector. The buffer
+ // cap stops accumulation at 64 KB; the truncated value still flows.
+ String giantString = "a".repeat(200_000);
+ Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr",
giantString));
+ String got = m.get("custom:bigstr");
+ assertNotNull(got, "string-typed property survives truncation");
+ assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, got.length(),
+ "string value must be capped at MAX_TEXT_BUFFER_LENGTH");
+ }
+
+ // ===== helpers =====
+
+ /**
+ * Builds a complete custom-properties document holding one property with
+ * a single {@code vt:<type>} child. {@code name}, {@code type} and
+ * {@code value} are concatenated as-is with no XML escaping, so callers
+ * must pass XML-safe text.
+ */
+ private static String customProperty(String name, String type, String
value) {
+ return CUSTOM_HEADER
+ + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\"
pid=\"2\""
+ + " name=\"" + name + "\">"
+ + "<vt:" + type + ">" + value + "</vt:" + type + ">"
+ + "</property>"
+ + CUSTOM_FOOTER;
+ }
+
+ /**
+ * Runs {@code xml} (encoded as UTF-8) through a fresh
+ * {@code CustomPropertiesHandler} and returns the metadata it produces.
+ * Parsing uses a namespace-aware JAXP {@code SAXParserFactory} directly,
+ * so these tests exercise the handler only, not the OPC part lookup.
+ */
+ private static Metadata parseCustomProperties(String xml) throws Exception
{
+ MetadataExtractor.CustomPropertiesHandler handler =
+ new MetadataExtractor.CustomPropertiesHandler();
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ factory.setNamespaceAware(true);
+ SAXParser parser = factory.newSAXParser();
+ parser.parse(new InputSource(new ByteArrayInputStream(
+ xml.getBytes(StandardCharsets.UTF_8))), handler);
+ Metadata metadata = new Metadata();
+ handler.applyTo(metadata);
+ return metadata;
+ }
+}