This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch docs/pipes-updates
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e6d9e53c1239aa23b6630a5bd3a5646275e45224
Author: tallison <[email protected]>
AuthorDate: Mon May 11 16:07:05 2026 -0400

    add s3
---
 docs/modules/ROOT/examples/pipes-s3-emitter.json   |   1 +
 docs/modules/ROOT/examples/pipes-s3-fetcher.json   |   1 +
 docs/modules/ROOT/examples/pipes-s3-iterator.json  |   1 +
 docs/modules/ROOT/examples/pipes-s3-pipeline.json  |   1 +
 docs/modules/ROOT/nav.adoc                         |   1 +
 docs/modules/ROOT/pages/pipes/plugins/index.adoc   |   2 +-
 docs/modules/ROOT/pages/pipes/plugins/s3.adoc      | 242 +++++++++++++++++++++
 .../apache/tika/pipes/s3/ConfigExamplesTest.java   | 136 ++++++++++++
 .../test/resources/config-examples/s3-emitter.json |  14 ++
 .../test/resources/config-examples/s3-fetcher.json |  15 ++
 .../resources/config-examples/s3-pipeline.json     |  49 +++++
 .../config-examples/s3-pipes-iterator.json         |  13 ++
 12 files changed, 475 insertions(+), 1 deletion(-)

diff --git a/docs/modules/ROOT/examples/pipes-s3-emitter.json 
b/docs/modules/ROOT/examples/pipes-s3-emitter.json
new file mode 120000
index 0000000000..6f05a73ec2
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-s3-emitter.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-s3-fetcher.json 
b/docs/modules/ROOT/examples/pipes-s3-fetcher.json
new file mode 120000
index 0000000000..b24bd4fa27
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-s3-fetcher.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-s3-iterator.json 
b/docs/modules/ROOT/examples/pipes-s3-iterator.json
new file mode 120000
index 0000000000..db1b210e82
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-s3-iterator.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-s3-pipeline.json 
b/docs/modules/ROOT/examples/pipes-s3-pipeline.json
new file mode 120000
index 0000000000..cc6f573ec2
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-s3-pipeline.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index ef16b190dd..90fce8701d 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -33,6 +33,7 @@
 ** xref:pipes/cpu-sizing.adoc[Forked-JVM CPU Sizing]
 ** xref:pipes/plugins/index.adoc[Plugins]
 *** xref:pipes/plugins/filesystem.adoc[File System]
+*** xref:pipes/plugins/s3.adoc[Amazon S3]
 * xref:configuration/index.adoc[Configuration]
 ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
 ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR]
diff --git a/docs/modules/ROOT/pages/pipes/plugins/index.adoc 
b/docs/modules/ROOT/pages/pipes/plugins/index.adoc
index 8542fa2034..d5173d2032 100644
--- a/docs/modules/ROOT/pages/pipes/plugins/index.adoc
+++ b/docs/modules/ROOT/pages/pipes/plugins/index.adoc
@@ -125,7 +125,7 @@ Many plugins implement more than one (e.g., the S3 plugin 
provides fetcher, emit
 
 == Interface Overviews
 
-For descriptions of the interfaces themselves — their contracts, the shared 
concepts (`FetchKey`, `FetchEmitTuple`, `baseConfig`, etc.), and how they fit 
into a pipeline — see:
+For descriptions of the interfaces themselves — their contracts, the shared 
concepts (`FetchKey`, `FetchEmitTuple`, `fetcherId`/`emitterId` wiring, etc.), 
and how they fit into a pipeline — see:
 
 * xref:pipes/fetchers.adoc[Fetchers]
 * xref:pipes/emitters.adoc[Emitters]
diff --git a/docs/modules/ROOT/pages/pipes/plugins/s3.adoc 
b/docs/modules/ROOT/pages/pipes/plugins/s3.adoc
new file mode 100644
index 0000000000..90d0960f06
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/s3.adoc
@@ -0,0 +1,242 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Amazon S3 Plugin
+:toc:
+:toclevels: 3
+
+The Amazon S3 plugin (`tika-pipes-s3`) provides fetcher, emitter, and iterator 
implementations for objects in S3 (or any S3-compatible service such as MinIO).
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Fetcher
+|`s3-fetcher`
+|`S3Fetcher`
+
+|Emitter
+|`s3-emitter`
+|`S3Emitter`
+
+|Iterator
+|`s3-pipes-iterator`
+|`S3PipesIterator`
+|===
+
+[#credentials]
+== Credentials
+
+All three components share the same `credentialsProvider` selector:
+
+* `profile` — reads credentials from the local AWS profile named by `profile` 
(e.g., `default`).
+* `instance` — uses the instance/container role attached to the host (EC2 IAM 
role, ECS task role, etc.). No additional fields needed.
+* `key_secret` — reads `accessKey` and `secretKey` from the config. Avoid 
checking these into source control; prefer environment-variable substitution or 
one of the other providers.
+
+The emitter's `validate()` enforces these values, but the fetcher and iterator 
do not — they fail later when the AWS SDK tries to resolve credentials.
+
+[#s3-fetcher]
+== S3 Fetcher (`s3-fetcher`)
+
+Reads objects from an S3 bucket. The fetch key is the object's S3 key, relative 
to `prefix` if one is set.
+
+[source,json]
+----
+include::example$pipes-s3-fetcher.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|S3 bucket name.
+
+|`region`
+|_required_
+|AWS region (e.g., `us-east-1`).
+
+|`prefix`
+|_no default_
+|Optional key prefix. Fetch keys are resolved underneath this prefix.
+
+|`credentialsProvider`
+|_required_
+|One of `profile`, `instance`, `key_secret`. See <<credentials>>.
+
+|`profile` / `accessKey` / `secretKey`
+|_conditional_
+|Required by the matching `credentialsProvider`.
+
+|`spoolToTemp`
+|`true`
+|If `true`, the fetched object is spooled to a temp file before being parsed.
+
+|`extractUserMetadata`
+|`true`
+|If `true`, S3 user-metadata is copied into the parsed `Metadata`.
+
+|`maxConnections`
+|`0`
+|Maximum HTTP connections in the S3 client pool. `0` lets the SDK pick a 
default.
+
+|`maxLength`
+|`-1`
+|Maximum object size, in bytes. `-1` means no limit.
+
+|`endpointConfigurationService`
+|_no default_
+|Custom S3 endpoint, for S3-compatible services such as MinIO or LocalStack.
+
+|`pathStyleAccessEnabled`
+|`false`
+|Force path-style URLs (e.g., `https://endpoint/bucket/key`). Required by some 
S3-compatible services.
+
+|`throttleSeconds`
+|_no default_
+|Optional array of sleep durations, in seconds; after each consecutive failure, 
fetching pauses for the corresponding number of seconds.
+|===
+
+[#s3-emitter]
+== S3 Emitter (`s3-emitter`)
+
+Writes parsed results back to an S3 bucket. The emit key (relative to 
`prefix`) is derived from the `FetchEmitTuple`.
+
+[source,json]
+----
+include::example$pipes-s3-emitter.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|Destination S3 bucket name (validated non-blank).
+
+|`region`
+|_required_
+|AWS region (validated non-blank).
+
+|`credentialsProvider`
+|_required_
+|One of `profile`, `instance`, `key_secret` (validated). See <<credentials>>.
+
+|`profile` / `accessKey` / `secretKey`
+|_conditional_
+|Required by the matching `credentialsProvider` (validated).
+
+|`prefix`
+|_no default_
+|Optional key prefix. A trailing `/` is stripped automatically.
+
+|`fileExtension`
+|`json`
+|Extension appended to each emitted key.
+
+|`spoolToTemp`
+|`true`
+|If `true`, output is spooled locally before being uploaded.
+
+|`maxConnections`
+|`50`
+|Maximum HTTP connections in the S3 client pool.
+
+|`endpointConfigurationService`
+|_no default_
+|Custom S3 endpoint, for S3-compatible services.
+
+|`pathStyleAccessEnabled`
+|`false`
+|Force path-style URLs.
+|===
+
+[#s3-iterator]
+== S3 Iterator (`s3-pipes-iterator`)
+
+Lists objects under a bucket/prefix and emits one `FetchEmitTuple` per object 
found.
+
+[source,json]
+----
+include::example$pipes-s3-iterator.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|S3 bucket to enumerate.
+
+|`region`
+|_required_
+|AWS region.
+
+|`prefix`
+|`""`
+|Key prefix to scope the listing.
+
+|`credentialsProvider`
+|_optional_
+|One of `profile`, `instance`, `key_secret`. See <<credentials>>.
+
+|`profile` / `accessKey` / `secretKey` / `endpointConfigurationService`
+|_conditional_
+|Credential and endpoint fields, with the same meaning as in the fetcher and emitter.
+
+|`fileNamePattern`
+|_no default_
+|Optional regex; only keys whose name matches are emitted.
+
+|`maxConnections`
+|`50`
+|Maximum HTTP connections in the S3 client pool.
+
+|`pathStyleAccessEnabled`
+|`false`
+|Force path-style URLs.
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See 
xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+[#s3-pipeline]
+== Complete Pipeline Example
+
+The example below wires the S3 fetcher, emitter, and iterator into a complete 
pipeline that lists `s3://my-tika-input/incoming/` and writes results to 
`s3://my-tika-output/results/`.
+
+[source,json]
+----
+include::example$pipes-s3-pipeline.json[]
+----
+
+[#notes]
+== Notes
+
+* The fetcher, emitter, and iterator each maintain their own S3 client. Auth 
and endpoint settings need to be configured per component, not globally.
+* The AWS SDK uses TLS 1.2+ by default, so data is encrypted in transit. For 
at-rest encryption, configure bucket-level SSE on the AWS side.
+* When using `endpointConfigurationService` against MinIO or LocalStack, you 
almost always need `pathStyleAccessEnabled: true`.
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java
new file mode 100644
index 0000000000..f248d8194e
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.s3;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.pipes.emitter.s3.S3EmitterConfig;
+import org.apache.tika.pipes.fetcher.s3.config.S3FetcherConfig;
+import org.apache.tika.pipes.iterator.s3.S3PipesIteratorConfig;
+
+/**
+ * Validates S3 fetcher/emitter/iterator configuration examples used in 
documentation.
+ * <p>
+ * The JSON configuration examples are stored in {@code 
src/test/resources/config-examples/}
+ * and are included directly in the AsciiDoc documentation via the {@code 
include::} directive.
+ */
+public class ConfigExamplesTest {
+
+    private static final String EXAMPLES_DIR = "/config-examples/";
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    @TempDir
+    Path tempDir;
+
+    private String readExample(String resourceName) throws Exception {
+        try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + 
resourceName)) {
+            assertNotNull(is, "Resource not found: " + resourceName);
+            return new String(is.readAllBytes(), StandardCharsets.UTF_8);
+        }
+    }
+
+    private void loadViaTikaLoader(String resourceName) throws Exception {
+        String json = readExample(resourceName);
+        Path configFile = tempDir.resolve("tika-config.json");
+        Files.writeString(configFile, json, StandardCharsets.UTF_8);
+        TikaLoader loader = TikaLoader.load(configFile);
+        assertNotNull(loader, "TikaLoader should not be null for: " + 
resourceName);
+    }
+
+    private JsonNode innerComponent(String json, String section, String id, 
String typeName)
+            throws Exception {
+        JsonNode root = OBJECT_MAPPER.readTree(json);
+        JsonNode sectionNode = root.get(section);
+        assertNotNull(sectionNode, "Missing section: " + section);
+        JsonNode idNode = id == null ? sectionNode : sectionNode.get(id);
+        assertNotNull(idNode, "Missing id: " + id);
+        JsonNode typed = idNode.get(typeName);
+        assertNotNull(typed, "Missing type: " + typeName);
+        return typed;
+    }
+
+    @Test
+    public void testS3FetcherConfig() throws Exception {
+        loadViaTikaLoader("s3-fetcher.json");
+
+        JsonNode inner = innerComponent(readExample("s3-fetcher.json"),
+                "fetchers", "s3f", "s3-fetcher");
+        S3FetcherConfig config = S3FetcherConfig.load(inner.toString());
+        assertEquals("my-tika-input", config.getBucket());
+        assertEquals("us-east-1", config.getRegion());
+        assertEquals("profile", config.getCredentialsProvider());
+        assertEquals("default", config.getProfile());
+    }
+
+    @Test
+    public void testS3EmitterConfig() throws Exception {
+        loadViaTikaLoader("s3-emitter.json");
+
+        JsonNode inner = innerComponent(readExample("s3-emitter.json"),
+                "emitters", "s3e", "s3-emitter");
+        S3EmitterConfig config = S3EmitterConfig.load(inner.toString());
+        assertEquals("my-tika-output", config.bucket());
+        assertEquals("us-east-1", config.region());
+        assertEquals("profile", config.credentialsProvider());
+        assertEquals("json", config.fileExtension());
+        // exercises required-field + credentialsProvider whitelist validation
+        config.validate();
+    }
+
+    @Test
+    public void testS3IteratorConfig() throws Exception {
+        loadViaTikaLoader("s3-pipes-iterator.json");
+
+        JsonNode inner = innerComponent(readExample("s3-pipes-iterator.json"),
+                "pipes-iterator", null, "s3-pipes-iterator");
+        S3PipesIteratorConfig config = 
S3PipesIteratorConfig.load(inner.toString());
+        assertEquals("my-tika-input", config.getBucket());
+        assertEquals("us-east-1", config.getRegion());
+        assertEquals("s3f", config.getFetcherId());
+        assertEquals("s3e", config.getEmitterId());
+    }
+
+    @Test
+    public void testS3PipelineConfig() throws Exception {
+        loadViaTikaLoader("s3-pipeline.json");
+
+        String json = readExample("s3-pipeline.json");
+        S3FetcherConfig fetcher = S3FetcherConfig.load(
+                innerComponent(json, "fetchers", "s3f", 
"s3-fetcher").toString());
+        S3EmitterConfig emitter = S3EmitterConfig.load(
+                innerComponent(json, "emitters", "s3e", 
"s3-emitter").toString());
+        S3PipesIteratorConfig iterator = S3PipesIteratorConfig.load(
+                innerComponent(json, "pipes-iterator", null, 
"s3-pipes-iterator").toString());
+
+        emitter.validate();
+        assertEquals(fetcher.getBucket(), iterator.getBucket());
+        assertEquals("s3f", iterator.getFetcherId());
+        assertEquals("s3e", iterator.getEmitterId());
+    }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json
new file mode 100644
index 0000000000..8cd5557db1
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json
@@ -0,0 +1,14 @@
+{
+  "emitters": {
+    "s3e": {
+      "s3-emitter": {
+        "bucket": "my-tika-output",
+        "region": "us-east-1",
+        "prefix": "results/",
+        "fileExtension": "json",
+        "credentialsProvider": "profile",
+        "profile": "default"
+      }
+    }
+  }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json
new file mode 100644
index 0000000000..8047fee2b0
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json
@@ -0,0 +1,15 @@
+{
+  "fetchers": {
+    "s3f": {
+      "s3-fetcher": {
+        "bucket": "my-tika-input",
+        "region": "us-east-1",
+        "prefix": "incoming/",
+        "credentialsProvider": "profile",
+        "profile": "default",
+        "extractUserMetadata": true,
+        "spoolToTemp": true
+      }
+    }
+  }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json
new file mode 100644
index 0000000000..1f17aa7081
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json
@@ -0,0 +1,49 @@
+{
+  "content-handler-factory": {
+    "basic-content-handler-factory": {
+      "type": "TEXT",
+      "writeLimit": -1,
+      "throwOnWriteLimitReached": true
+    }
+  },
+  "fetchers": {
+    "s3f": {
+      "s3-fetcher": {
+        "bucket": "my-tika-input",
+        "region": "us-east-1",
+        "prefix": "incoming/",
+        "credentialsProvider": "profile",
+        "profile": "default",
+        "extractUserMetadata": true
+      }
+    }
+  },
+  "emitters": {
+    "s3e": {
+      "s3-emitter": {
+        "bucket": "my-tika-output",
+        "region": "us-east-1",
+        "prefix": "results/",
+        "fileExtension": "json",
+        "credentialsProvider": "profile",
+        "profile": "default"
+      }
+    }
+  },
+  "pipes-iterator": {
+    "s3-pipes-iterator": {
+      "bucket": "my-tika-input",
+      "region": "us-east-1",
+      "prefix": "incoming/",
+      "credentialsProvider": "profile",
+      "profile": "default",
+      "fetcherId": "s3f",
+      "emitterId": "s3e"
+    }
+  },
+  "pipes": {
+    "parseMode": "RMETA",
+    "onParseException": "EMIT",
+    "numClients": 4
+  }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json
new file mode 100644
index 0000000000..e1fb2e9875
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json
@@ -0,0 +1,13 @@
+{
+  "pipes-iterator": {
+    "s3-pipes-iterator": {
+      "bucket": "my-tika-input",
+      "region": "us-east-1",
+      "prefix": "incoming/",
+      "credentialsProvider": "profile",
+      "profile": "default",
+      "fetcherId": "s3f",
+      "emitterId": "s3e"
+    }
+  }
+}

Reply via email to