This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4637
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c8fff1b6155b4bed19d986ab357bcdf9c860c0d6
Author: tallison <[email protected]>
AuthorDate: Sat Jan 31 19:31:14 2026 -0500

    TIKA-4637 -- update docs
---
 .../ROOT/pages/advanced/setting-limits.adoc        | 59 ++++++++++++++++++++++
 .../configs/cxf-unpack-test-template.json          |  2 +-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/docs/modules/ROOT/pages/advanced/setting-limits.adoc 
b/docs/modules/ROOT/pages/advanced/setting-limits.adoc
index 8064a4d88e..a3af216175 100644
--- a/docs/modules/ROOT/pages/advanced/setting-limits.adoc
+++ b/docs/modules/ROOT/pages/advanced/setting-limits.adoc
@@ -284,6 +284,64 @@ TimeoutLimits limits = TimeoutLimits.get(context);
 
 See test: 
`tika-serialization/src/test/java/org/apache/tika/config/TimeoutLimitsTest.java`
 
+== Embedded Byte Extraction Limits
+
+When extracting embedded document bytes using `ParseMode.UNPACK`, the 
`UnpackConfig` class
+provides safety limits on total bytes extracted. This protects against zip 
bombs and other
+malicious files that may expand to enormous sizes when unpacked.
+
+=== Configuration Options
+
+[cols="2,1,3"]
+|===
+|Setting |Default |Description
+
+|`maxUnpackBytes`
+|10 GB
+|Maximum total bytes to extract from all embedded documents per file. Set to 
-1 for
+unlimited (not recommended for untrusted input).
+|===
+
+=== Behavior
+
+When the byte limit is reached:
+
+* Extraction stops for remaining embedded documents
+* An exception is logged but processing continues
+* Already-extracted bytes are kept
+* The parse result status is `PARSE_SUCCESS_WITH_EXCEPTION`
+
+=== JSON Configuration
+
+[source,json]
+----
+{
+  "parseContext": {
+    "parseMode": "UNPACK",
+    "unpack-config": {
+      "maxUnpackBytes": 104857600
+    }
+  }
+}
+----
+
+This limits extraction to 100 MB total.
+
+=== Java API
+
+[source,java]
+----
+UnpackConfig config = new UnpackConfig();
+config.setMaxUnpackBytes(100 * 1024 * 1024); // 100 MB
+config.setEmitter("my-emitter");
+parseContext.set(UnpackConfig.class, config);
+parseContext.set(ParseMode.class, ParseMode.UNPACK);
+----
+
+For more details on embedded byte extraction configuration, see 
xref:pipes/unpack-config.adoc[Extracting Embedded Bytes].
+
+See test: 
`tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UnpackModeTest.java`
+
 == Metadata Limits
 
 The `MetadataWriteLimiter` system allows you to constrain metadata size at 
write time,
@@ -407,3 +465,4 @@ if 
("true".equals(metadata.get(TikaCoreProperties.TRUNCATED_METADATA))) {
 
 * xref:advanced/robustness.adoc[Robustness] - Process isolation and fault 
tolerance
 * xref:configuration/index.adoc[Configuration] - General Tika configuration
+* xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] - UnpackConfig for 
byte extraction
diff --git 
a/tika-server/tika-server-core/src/test/resources/configs/cxf-unpack-test-template.json
 
b/tika-server/tika-server-core/src/test/resources/configs/cxf-unpack-test-template.json
index f0d257cd44..f206e15199 100644
--- 
a/tika-server/tika-server-core/src/test/resources/configs/cxf-unpack-test-template.json
+++ 
b/tika-server/tika-server-core/src/test/resources/configs/cxf-unpack-test-template.json
@@ -25,7 +25,7 @@
   "auto-detect-parser": {
     "throwOnZeroBytes": false
   },
-  "other-configs": {
+  "parse-context": {
     "digester-factory": {
       "commons-digester-factory": {
         "digests": [

Reply via email to